# this repo has no description
1"""generate tokenizer_data.zig from spaCy's en_core_web_sm tokenizer config.
2
3extracts:
4 - unicode character class tables (sorted ranges for binary search)
5 - prefix single-char set + multi-char literals + special rules
6 - suffix data (single-char set, multi-char literals, lookbehind rules)
7 - special cases table (1347 entries)
8
9the matching LOGIC lives in tokenizer.zig. this script only generates DATA tables.
10
11usage:
12 uv run --python 3.12 --with spacy \
13 --with 'en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl' \
14 python scripts/gen_tokenizer_data.py
15"""
16
17import json
18import re
19import sre_parse
20import sys
21from pathlib import Path
22
23
def load_spacy():
    """load spaCy and extract all tokenizer config."""
    # imported lazily so the module can be inspected without spaCy installed
    import spacy

    return spacy.load("en_core_web_sm").tokenizer
31
32
def extract_ranges(items):
    """convert sre_parse IN items to sorted, merged (lo, hi) ranges."""
    raw = []
    for op, arg in items:
        if op == sre_parse.LITERAL:
            raw.append((arg, arg))
        elif op == sre_parse.RANGE:
            raw.append(arg)
        elif op == sre_parse.CATEGORY:
            if arg == sre_parse.CATEGORY_DIGIT:
                raw.append((0x30, 0x39))
            elif arg == sre_parse.CATEGORY_WORD:
                # ascii \w: digits, uppercase, underscore, lowercase
                raw += [(0x30, 0x39), (0x41, 0x5A), (0x5F, 0x5F), (0x61, 0x7A)]

    # merge overlapping or adjacent ranges after sorting by lower bound
    merged = []
    for lo, hi in sorted(raw):
        if not merged or lo > merged[-1][1] + 1:
            merged.append((lo, hi))
        else:
            prev_lo, prev_hi = merged[-1]
            merged[-1] = (prev_lo, max(prev_hi, hi))
    return merged
54
55
def class_from_in_node(in_items):
    """extract character class from an IN node, handling NEGATE."""
    is_negated = False
    body = []
    for entry in in_items:
        if entry[0] == sre_parse.NEGATE:
            is_negated = True
        else:
            body.append(entry)
    return extract_ranges(body), is_negated
62
63
64# ── prefix data extraction ──
65
66
def extract_prefix_data(tok):
    """extract prefix pattern data: single chars, multi-char literals, char class, specials.

    returns a dict with keys:
      single_chars         — sorted, deduped codepoints matched as one-char prefixes
      multi_literals       — multi-char literal prefixes, longest first
      symbol_ranges        — the big unicode symbol class (>50 ranges heuristic)
      has_dots             — whether a ".."+ style prefix branch exists
      literal_unless_digit — chars whose branch carries a negative lookahead
                             (presumably "not before a digit" — confirm against
                             the actual spaCy pattern)
    """
    # NOTE(review): assumes the parsed pattern is (AT_BEGINNING, BRANCH) so the
    # alternatives live at parsed[1][1][1] — TODO confirm for other spaCy versions
    pat = tok.prefix_search.__self__.pattern
    parsed = sre_parse.parse(pat)
    branches = parsed[1][1][1]  # AT_BEGINNING, BRANCH

    single_chars = []  # codepoints matched as single-char prefix
    multi_literals = []  # multi-byte string prefixes
    symbol_ranges = []  # the big unicode symbol class
    dots = False  # whether ..+ is a prefix
    literal_unless_digit = []  # chars like + that don't match before digits

    for branch in branches:
        if len(branch) == 1:
            # single-node branch: either one literal or one character class
            op, val = branch[0]
            if op == sre_parse.LITERAL:
                single_chars.append(val)
            elif op == sre_parse.IN:
                ranges, _ = class_from_in_node(val)
                if len(ranges) > 50:
                    # heuristic: only the big symbol class has this many ranges
                    symbol_ranges = ranges
                else:
                    # small class — expand to individual chars
                    for lo, hi in ranges:
                        for cp in range(lo, hi + 1):
                            single_chars.append(cp)
        elif all(b[0] == sre_parse.LITERAL for b in branch):
            # run of literals -> a multi-char string prefix
            s = "".join(chr(b[1]) for b in branch)
            multi_literals.append(s)
        elif (
            len(branch) == 2
            and branch[0][0] == sre_parse.LITERAL
            and branch[1][0] == sre_parse.MAX_REPEAT
        ):
            # literal followed by a repeat — the ".."+ prefix branch
            dots = True
        elif (
            len(branch) == 2
            and branch[0][0] == sre_parse.LITERAL
            and branch[1][0] == sre_parse.ASSERT_NOT
        ):
            # literal guarded by a negative lookahead; only the literal is kept,
            # the lookahead content itself is not inspected here
            literal_unless_digit.append(branch[0][1])

    return {
        "single_chars": sorted(set(single_chars)),
        "multi_literals": sorted(multi_literals, key=lambda s: -len(s)),
        "symbol_ranges": symbol_ranges,
        "has_dots": dots,
        "literal_unless_digit": literal_unless_digit,
    }
116
117
118# ── suffix data extraction ──
119
120
def extract_suffix_data(tok):
    """extract suffix pattern data.

    returns a dict mirroring extract_prefix_data's shape, plus
    "lookbehind_rules": serialized (behind, suffix) descriptors produced by
    _extract_lookbehind_rule for suffixes guarded by a lookbehind.
    """
    # NOTE(review): assumes the parsed suffix pattern is a single top-level
    # BRANCH at parsed[0] — TODO confirm for other spaCy versions
    pat = tok.suffix_search.__self__.pattern
    parsed = sre_parse.parse(pat)
    branches = parsed[0][1][1]  # BRANCH

    single_chars = []
    multi_literals = []
    symbol_ranges = []
    has_dots = False
    lookbehind_rules = []

    for branch in branches:
        items = list(branch)
        # drop a trailing $ anchor so the classification below sees only content
        if items and items[-1] == (sre_parse.AT, sre_parse.AT_END):
            items = items[:-1]
        if not items:
            continue

        # simple literal(s)
        if all(x[0] == sre_parse.LITERAL for x in items):
            s = "".join(chr(x[1]) for x in items)
            if len(s) == 1:
                single_chars.append(ord(s))
            else:
                multi_literals.append(s)
            continue

        # character class
        if len(items) == 1 and items[0][0] == sre_parse.IN:
            ranges, _ = class_from_in_node(items[0][1])
            if len(ranges) > 50:
                # heuristic: only the big symbol class has this many ranges
                symbol_ranges = ranges
            else:
                for lo, hi in ranges:
                    for cp in range(lo, hi + 1):
                        single_chars.append(cp)
            continue

        # dots
        if (
            len(items) >= 2
            and items[0] == (sre_parse.LITERAL, ord("."))
            and items[1][0] == sre_parse.MAX_REPEAT
        ):
            has_dots = True
            continue

        # lookbehind rule
        if items[0][0] == sre_parse.ASSERT:
            direction = items[0][1][0]
            if direction == -1:  # lookbehind
                # unparseable rules return None and are silently skipped
                rule = _extract_lookbehind_rule(items)
                if rule:
                    lookbehind_rules.append(rule)
            continue

    return {
        "single_chars": sorted(set(single_chars)),
        "multi_literals": sorted(multi_literals, key=lambda s: -len(s)),
        "symbol_ranges": symbol_ranges,
        "has_dots": has_dots,
        "lookbehind_rules": lookbehind_rules,
    }
185
186
def _extract_lookbehind_rule(items):
    """extract a suffix lookbehind rule into a serializable structure."""
    # items[0] is the ASSERT node; its payload at [1][1] is the asserted content
    behind = _parse_assert_content(items[0][1][1])
    if behind is None:
        return None

    # everything after the assertion is the suffix text itself
    suffix = _parse_suffix_part(items[1:])
    if suffix is None:
        return None

    return {"behind": behind, "suffix": suffix}
203
204
205def _parse_assert_content(content):
206 """parse lookbehind/lookahead content into a descriptor."""
207 parts = []
208 for item in content:
209 if item[0] == sre_parse.IN:
210 ranges, negated = class_from_in_node(item[1])
211 parts.append({"type": "class", "ranges": ranges, "negated": negated})
212 elif item[0] == sre_parse.LITERAL:
213 parts.append({"type": "literal", "char": item[1]})
214 else:
215 return None
216 if len(parts) == 1:
217 return parts[0]
218 elif len(parts) > 1:
219 return {"type": "sequence", "parts": parts}
220 return None
221
222
223def _parse_suffix_part(items):
224 """parse the suffix portion after lookbehind."""
225 if all(x[0] == sre_parse.LITERAL for x in items):
226 s = "".join(chr(x[1]) for x in items)
227 return {"type": "literal", "text": s}
228
229 # subpattern with alternatives
230 if len(items) == 1 and items[0][0] == sre_parse.SUBPATTERN:
231 content = items[0][1][3]
232 if content and content[0][0] == sre_parse.BRANCH:
233 alts = []
234 for branch in content[0][1][1]:
235 if all(x[0] == sre_parse.LITERAL for x in branch):
236 alts.append("".join(chr(x[1]) for x in branch))
237 if alts:
238 return {"type": "alternatives", "texts": alts}
239
240 # BRANCH directly
241 if len(items) == 1 and items[0][0] == sre_parse.BRANCH:
242 alts = []
243 for branch in items[0][1][1]:
244 if all(x[0] == sre_parse.LITERAL for x in branch):
245 alts.append("".join(chr(x[1]) for x in branch))
246 if alts:
247 return {"type": "alternatives", "texts": alts}
248
249 return None
250
251
252# ── unicode class extraction from all patterns ──
253
254
def extract_named_classes(tok):
    """extract the specific unicode character classes used across patterns.

    we identify them by their content:
     - symbol: the big So/Sc class (~174 ranges)
     - lower: lowercase letters (contains a-z)
     - upper: uppercase letters (contains A-Z)
     - alpha: lower + upper
     - alnum: alpha + digits
     - lower_or_punct: the wide "not just upper" class used in suffix lookbehinds

    returns the dict of classes filled in by _classify; keys are the bucket
    names above (wide classes get a generated "wide_<n>" key).
    """
    classes = {}

    # extract from suffix lookbehinds
    suffix_pat = tok.suffix_search.__self__.pattern
    sp = sre_parse.parse(suffix_pat)

    # recursive walk over the sre parse tree; every sufficiently large IN node
    # (> 5 ranges) is handed to _classify. `label` is currently unused beyond
    # the recursive pass-through; negation of classes is ignored here.
    def walk_for_classes(items, label=""):
        for item in items:
            op = item[0]
            if op == sre_parse.IN:
                ranges, negated = class_from_in_node(item[1])
                if len(ranges) > 5:
                    _classify(ranges, classes)
            elif op == sre_parse.BRANCH:
                # item[1] is (None, [branch, ...]) — recurse into each branch
                for b in item[1][1]:
                    walk_for_classes(b, label)
            elif op in (sre_parse.ASSERT, sre_parse.ASSERT_NOT):
                # item[1] is (direction, content) — recurse into the content
                walk_for_classes(item[1][1], label)
            elif op == sre_parse.SUBPATTERN:
                # item[1] is (group, add_flags, del_flags, content)
                if item[1][3]:
                    walk_for_classes(list(item[1][3]), label)

    walk_for_classes(list(sp), "suffix")

    # also from infix
    infix_pat = tok.infix_finditer.__self__.pattern
    ip = sre_parse.parse(infix_pat)
    walk_for_classes(list(ip), "infix")

    return classes
296
297
298def _classify(ranges, classes):
299 """classify a character range set by its content."""
300 range_set = set(ranges)
301
302 # check for a-z presence → lower
303 has_az = (0x61, 0x7A) in range_set
304 has_AZ = (0x41, 0x5A) in range_set
305 has_09 = (0x30, 0x39) in range_set or (0x30, 0x39) in range_set
306
307 n_ranges = len(ranges)
308 n_cp = sum(hi - lo + 1 for lo, hi in ranges)
309
310 if has_az and not has_AZ and not has_09 and n_cp > 1000:
311 if "lower" not in classes or len(ranges) > len(classes["lower"]):
312 classes["lower"] = ranges
313 elif has_AZ and not has_az and not has_09 and n_cp > 1000:
314 if "upper" not in classes or len(ranges) > len(classes["upper"]):
315 classes["upper"] = ranges
316 elif has_az and has_AZ and not has_09 and n_cp > 1000:
317 if "alpha" not in classes or len(ranges) > len(classes["alpha"]):
318 classes["alpha"] = ranges
319 elif has_az and has_AZ and has_09 and n_cp > 1000:
320 if "alnum" not in classes or len(ranges) > len(classes["alnum"]):
321 classes["alnum"] = ranges
322 elif n_cp > 100000 and n_ranges > 300:
323 # very large class — likely "lower_or_punct" or similar
324 key = f"wide_{n_ranges}"
325 classes[key] = ranges
326
327
328# ── special cases ──
329
330
def extract_specials(tok):
    """extract special case rules."""
    ORTH = 65  # spacy.attrs.ORTH
    return [
        (key, [token_attrs[ORTH] for token_attrs in attrs_list])
        for key, attrs_list in sorted(tok.rules.items())
    ]
338
339
340# ── zig code generation ──
341
342
def zig_str(s):
    """convert a python string to a zig string literal."""
    escapes = {'"': '\\"', "\\": "\\\\", "\n": "\\n", "\t": "\\t"}
    out = []
    for ch in s:
        cp = ord(ch)
        if cp >= 128:
            # non-ascii: emit the raw utf-8 bytes as hex escapes
            out.extend(f"\\x{b:02x}" for b in ch.encode("utf-8"))
        elif ch in escapes:
            out.append(escapes[ch])
        elif ch.isprintable():
            out.append(ch)
        else:
            # remaining ascii control chars
            out.append(f"\\x{cp:02x}")
    return '"' + "".join(out) + '"'
365
366
def zig_char(cp):
    """convert a codepoint to a zig u21 literal."""
    is_plain_ascii = 32 <= cp < 127 and chr(cp) not in "'\\\""
    if is_plain_ascii:
        return "'" + chr(cp) + "'"
    # quote, backslash and non-printable/non-ascii fall back to hex
    return f"0x{cp:04X}"
372
373
def gen_range_table(name, ranges):
    """generate a const range table + lookup function."""
    rows = [f"    .{{ 0x{lo:04X}, 0x{hi:04X} }}," for lo, hi in ranges]
    return "\n".join(
        [f"pub const {name}_ranges = [_][2]u21{{"]
        + rows
        + [
            "};",
            "",
            f"pub fn {name}(c: u21) bool {{",
            f"    return rangeContains(&{name}_ranges, c);",
            "}",
        ]
    )
386
387
def gen_codepoint_set(name, codepoints):
    """generate a switch-based codepoint set."""
    cps = sorted(set(codepoints))

    # collapse sorted codepoints into consecutive (start, end) runs
    runs = []
    for cp in cps:
        if runs and cp == runs[-1][1] + 1:
            runs[-1] = (runs[-1][0], cp)
        else:
            runs.append((cp, cp))

    arms = []
    for lo, hi in runs:
        pattern = zig_char(lo) if lo == hi else f"{zig_char(lo)}...{zig_char(hi)}"
        arms.append(f"        {pattern} => true,")

    return "\n".join(
        [f"pub fn {name}(c: u21) bool {{", "    return switch (c) {"]
        + arms
        + ["        else => false,", "    };", "}"]
    )
415
416
def gen_specials(entries):
    """generate the special cases StaticStringMap."""
    widest = max(len(orths) for _, orths in entries)
    assert widest <= 3, f"max tokens {widest} > 3"

    out = [
        "pub const SpecialCase = struct {",
        "    tokens: [3][]const u8,",
        "    len: u8,",
        "};",
        "",
        "pub const specials = std.StaticStringMap(SpecialCase).initComptime(.{",
    ]
    for key, orths in entries:
        # pad the token list to exactly 3 entries with empty strings
        padded = [zig_str(o) for o in orths] + ['""'] * (3 - len(orths))
        joined = ", ".join(padded)
        out.append(
            f"    .{{ {zig_str(key)}, SpecialCase{{ .tokens = .{{ {joined} }}, .len = {len(orths)} }} }},"
        )
    out.append("});")
    return "\n".join(out)
442
443
def gen_multi_literals(name, literals):
    """generate an array of multi-char literals for matching."""
    header = f"pub const {name} = [_][]const u8{{"
    body = [f"    {zig_str(lit)}," for lit in literals]
    return "\n".join([header] + body + ["};"])
452
453
def gen_lookbehind_rules(rules):
    """generate suffix lookbehind rule data structures.

    returns (zig_code, rule_descs): the class-table helper functions as zig
    source, plus per-rule descriptors consumed later by generate().
    """
    class_tables = {}
    rule_descs = []

    for rule in rules:
        behind = rule["behind"]
        suffix = rule["suffix"]
        if suffix["type"] == "literal":
            texts = [suffix["text"]]
        else:
            texts = suffix.get("texts", [])
        rule_descs.append(
            {
                "behind_id": _get_class_id(behind, class_tables),
                "behind": behind,
                "suffix_texts": texts,
            }
        )

    # emit one range table + predicate per unique lookbehind class
    code = []
    for cid, ranges in class_tables.items():
        code.append(f"const lookbehind_class_{cid}_ranges = [_][2]u21{{")
        code.extend(f"    .{{ 0x{lo:04X}, 0x{hi:04X} }}," for lo, hi in ranges)
        code.append("};")
        code.append("")
        code.append(f"pub fn matchLookbehind{cid}(c: u21) bool {{")
        code.append(f"    return rangeContains(&lookbehind_class_{cid}_ranges, c);")
        code.append("}")
        code.append("")

    return "\n".join(code), rule_descs
489
490
# shared mutable state for _get_class_id: _class_cache maps a class's range-list
# repr to a stable integer id drawn from _class_counter. generate() resets both
# before emitting the lookbehind tables so ids start from 0 each run.
_class_counter = 0
_class_cache = {}
493
494
def _get_class_id(behind, class_tables):
    """map a lookbehind descriptor to a stable id used in generated zig names.

    returns:
      int                    — for a "class" descriptor (deduped via _class_cache)
      tuple of ids           — for a "sequence" (one id per part, in order)
      ("literal", codepoint) — for a single literal char
      None                   — for anything unrecognized
    side effect: newly seen classes are registered in `class_tables[cid]`.
    relies on the module-level _class_counter/_class_cache being reset by
    generate() before a run.
    """
    global _class_counter
    if behind["type"] == "class":
        # key on the range list's string form so identical classes share one id
        key = str(behind["ranges"])
        if key not in _class_cache:
            cid = _class_counter
            _class_counter += 1
            _class_cache[key] = cid
            class_tables[cid] = behind["ranges"]
        return _class_cache[key]
    elif behind["type"] == "sequence":
        # sequence of tests — generate IDs for each part
        ids = []
        for part in behind["parts"]:
            ids.append(_get_class_id(part, class_tables))
        return tuple(ids)
    elif behind["type"] == "literal":
        return ("literal", behind["char"])
    return None
514
515
def generate(tok):
    """generate the complete tokenizer_data.zig source as one string.

    emitted layout: file header, utf-8 helpers, binary-search range lookup,
    symbol class, prefix data, suffix data (including lookbehind rules as a
    single matchSuffixLookbehind function), infix classes, special cases.
    the caller is responsible for writing the result to disk.
    """
    print("extracting prefix data...")
    prefix = extract_prefix_data(tok)
    print(
        f"  {len(prefix['single_chars'])} single chars, "
        f"{len(prefix['multi_literals'])} multi literals, "
        f"{len(prefix['symbol_ranges'])} symbol ranges"
    )

    print("extracting suffix data...")
    suffix = extract_suffix_data(tok)
    print(
        f"  {len(suffix['single_chars'])} single chars, "
        f"{len(suffix['multi_literals'])} multi literals, "
        f"{len(suffix['lookbehind_rules'])} lookbehind rules"
    )

    print("extracting unicode classes...")
    classes = extract_named_classes(tok)
    print(f"  classes found: {list(classes.keys())}")

    print("extracting specials...")
    specials = extract_specials(tok)
    print(f"  {len(specials)} entries")

    # also extract the infix character classes directly
    # NOTE(review): assumes the infix pattern parses to a top-level BRANCH at
    # ip[0] and that branch indices 3..7 have the shapes listed below — these
    # indices are tied to the en_core_web_sm 3.8.0 pattern; confirm on upgrade.
    infix_pat = tok.infix_finditer.__self__.pattern
    ip = sre_parse.parse(infix_pat)
    infix_branches = ip[0][1][1]

    # infix[2] is the symbol class (same as prefix)
    # infix[3] lookbehind is digits, chars are +-*^, lookahead is digits+hyphen
    # infix[4] lookbehind is lower/punct, ahead is upper/alpha
    # infix[5] lookbehind is alpha, ahead is alpha
    # infix[6] branch alternatives: -, --, ---, ~, en-dash, em-dash, em-dash*2
    # infix[7] lookbehind is alnum, chars :/~<=>, ahead is alpha

    # extract infix lookbehind/lookahead classes
    infix_classes = {}
    for idx in [3, 4, 5, 6, 7]:
        branch = infix_branches[idx]
        for item in branch:
            if item[0] == sre_parse.ASSERT:
                direction = item[1][0]
                content = item[1][1]
                if len(content) == 1 and content[0][0] == sre_parse.IN:
                    ranges, _ = class_from_in_node(content[0][1])
                    label = (
                        f"infix_{idx}_{'behind' if direction == -1 else 'ahead'}"
                    )
                    infix_classes[label] = ranges

    # build output
    sections = []
    sections.append("//! generated by scripts/gen_tokenizer_data.py — do not edit.")
    sections.append("//! tokenizer pattern data compiled from spaCy en_core_web_sm.")
    sections.append("")
    sections.append('const std = @import("std");')
    sections.append("")

    # ── utf-8 helpers ──
    sections.append("// ── utf-8 helpers ──")
    sections.append("")
    sections.append("pub const Codepoint = struct { value: u21, len: u3 };")
    sections.append("")
    sections.append("pub fn decodeUtf8(bytes: []const u8) ?Codepoint {")
    sections.append("    if (bytes.len == 0) return null;")
    sections.append("    const b0 = bytes[0];")
    sections.append("    if (b0 < 0x80) return .{ .value = b0, .len = 1 };")
    sections.append("    if (b0 & 0xE0 == 0xC0 and bytes.len >= 2)")
    sections.append(
        "        return .{ .value = (@as(u21, b0 & 0x1F) << 6) | (bytes[1] & 0x3F), .len = 2 };"
    )
    sections.append("    if (b0 & 0xF0 == 0xE0 and bytes.len >= 3)")
    sections.append(
        "        return .{ .value = (@as(u21, b0 & 0x0F) << 12) | (@as(u21, bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F), .len = 3 };"
    )
    sections.append("    if (b0 & 0xF8 == 0xF0 and bytes.len >= 4)")
    sections.append(
        "        return .{ .value = (@as(u21, b0 & 0x07) << 18) | (@as(u21, bytes[1] & 0x3F) << 12) | (@as(u21, bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F), .len = 4 };"
    )
    # invalid leading byte: emit U+FFFD and advance one byte
    sections.append(
        '    return .{ .value = 0xFFFD, .len = 1 }; // replacement char'
    )
    sections.append("}")
    sections.append("")
    sections.append("pub fn lastCodepoint(text: []const u8) ?Codepoint {")
    sections.append("    if (text.len == 0) return null;")
    sections.append("    var i = text.len - 1;")
    sections.append("    while (i > 0 and text[i] & 0xC0 == 0x80) : (i -= 1) {}")
    sections.append("    return decodeUtf8(text[i..]);")
    sections.append("}")
    sections.append("")

    # ── range search ──
    sections.append("// ── range search ──")
    sections.append("")
    sections.append("fn rangeContains(ranges: []const [2]u21, c: u21) bool {")
    sections.append("    var lo: usize = 0;")
    sections.append("    var hi: usize = ranges.len;")
    sections.append("    while (lo < hi) {")
    sections.append("        const mid = lo + (hi - lo) / 2;")
    sections.append("        if (c > ranges[mid][1]) { lo = mid + 1; }")
    sections.append("        else if (c < ranges[mid][0]) { hi = mid; }")
    sections.append("        else return true;")
    sections.append("    }")
    sections.append("    return false;")
    sections.append("}")
    sections.append("")

    # ── symbol class (shared by prefix, suffix, infix) ──
    sections.append("// ── symbol class (So/Sc unicode categories) ──")
    sections.append("")
    sections.append(gen_range_table("isSymbol", prefix["symbol_ranges"]))
    sections.append("")

    # ── prefix data ──
    sections.append("// ── prefix data ──")
    sections.append("")
    sections.append(gen_codepoint_set("isPrefixChar", prefix["single_chars"]))
    sections.append("")
    sections.append(
        gen_multi_literals("prefix_multi_literals", prefix["multi_literals"])
    )
    sections.append("")
    if prefix["literal_unless_digit"]:
        cps = prefix["literal_unless_digit"]
        sections.append(gen_codepoint_set("isPrefixUnlessDigit", cps))
        sections.append("")

    # ── suffix data ──
    sections.append("// ── suffix data ──")
    sections.append("")
    sections.append(gen_codepoint_set("isSuffixChar", suffix["single_chars"]))
    sections.append("")
    sections.append(
        gen_multi_literals("suffix_multi_literals", suffix["multi_literals"])
    )
    sections.append("")

    # lookbehind helpers
    # reset the module-level id registry so class ids are stable per run
    global _class_counter, _class_cache
    _class_counter = 0
    _class_cache = {}

    lookbehind_code, rule_descs = gen_lookbehind_rules(suffix["lookbehind_rules"])
    if lookbehind_code.strip():
        sections.append("// ── suffix lookbehind helpers ──")
        sections.append("")
        sections.append(lookbehind_code)

    # generate a compact suffix lookbehind rule table
    # each rule is: check lookbehind condition, then try matching suffix text(s)
    sections.append("// ── suffix lookbehind rules ──")
    sections.append("// these are checked by tokenizer.zig matchSuffix()")
    sections.append(
        "// format: for each rule, check behind condition then try suffix literal(s)"
    )
    sections.append("")

    # encode rules as Zig code in a single function
    sections.append("pub fn matchSuffixLookbehind(text: []const u8) usize {")
    sections.append("    if (text.len < 2) return 0;")
    sections.append("")

    for ri, desc in enumerate(rule_descs):  # ri unused; kept for debugging
        behind = desc["behind"]
        suffix_texts = desc["suffix_texts"]

        # sort suffix texts longest first
        suffix_texts_sorted = sorted(suffix_texts, key=lambda s: -len(s.encode("utf-8")))

        for st in suffix_texts_sorted:
            blen = len(st.encode("utf-8"))
            zig_lit = zig_str(st)

            sections.append(
                f"    if (std.mem.endsWith(u8, text, {zig_lit}) and text.len > {blen}) {{"
            )

            bid = desc["behind_id"]
            if isinstance(bid, int):
                # simple class check
                sections.append(
                    f"        const before = lastCodepoint(text[0 .. text.len - {blen}]);"
                )
                sections.append(
                    f"        if (before != null and matchLookbehind{bid}(before.?.value)) return {blen};"
                )
            elif isinstance(bid, tuple) and isinstance(bid[0], str) and bid[0] == "literal":
                # literal check
                cp = bid[1]
                sections.append(
                    f"        const before = lastCodepoint(text[0 .. text.len - {blen}]);"
                )
                sections.append(
                    f"        if (before != null and before.?.value == {zig_char(cp)}) return {blen};"
                )
            elif isinstance(bid, tuple):
                # sequence check (multiple lookbehinds)
                # NOTE(review): only 2-element sequences get a body; longer
                # sequences emit an empty `if (b1)` block — confirm intended
                sections.append(
                    f"        const b1 = lastCodepoint(text[0 .. text.len - {blen}]);"
                )
                sections.append(f"        if (b1) |bp1| {{")

                if len(bid) == 2:
                    sections.append(
                        f"            const b2 = lastCodepoint(text[0 .. text.len - {blen} - bp1.len]);"
                    )
                    # bid[0] is the class before bp2, bid[1] is the class for bp1
                    test1 = (
                        f"matchLookbehind{bid[1]}(bp1.value)"
                        if isinstance(bid[1], int)
                        else f"bp1.value == {zig_char(bid[1][1])}"
                    )
                    test0 = (
                        f"matchLookbehind{bid[0]}(b2p.value)"
                        if isinstance(bid[0], int)
                        else f"b2p.value == {zig_char(bid[0][1])}"
                    )
                    sections.append(f"            if ({test1}) {{")
                    sections.append(f"                if (b2) |b2p| {{")
                    sections.append(
                        f"                    if ({test0}) return {blen};"
                    )
                    sections.append(f"                }}")
                    sections.append(f"            }}")

                sections.append(f"        }}")

            sections.append("    }")

    sections.append("    return 0;")
    sections.append("}")
    sections.append("")

    # ── infix character class tables ──
    sections.append("// ── infix character classes ──")
    sections.append("")
    for label, ranges in sorted(infix_classes.items()):
        name = f"is_{label}"
        sections.append(gen_range_table(name, ranges))
        sections.append("")

    # ── specials ──
    sections.append("// ── special cases ──")
    sections.append("")
    sections.append(gen_specials(specials))
    sections.append("")

    return "\n".join(sections)
768
769
def main():
    """drive the full pipeline: load spaCy, emit zig data, dump test fixtures."""
    print("loading spaCy...")
    tok = load_spacy()

    print("\ngenerating zig source...")
    zig_source = generate(tok)

    out_path = Path("src/tokenizer_data.zig")
    out_path.write_text(zig_source)
    n_lines = zig_source.count("\n") + 1
    print(f"\nwrote {out_path} ({len(zig_source):,} bytes, {n_lines:,} lines)")

    # verification: run spaCy tokenizer on test inputs and dump expected output
    print("\ngenerating test data...")
    import spacy

    nlp = spacy.load("en_core_web_sm")
    test_sentences = [
        "Barack Obama visited Paris.",
        "Apple Inc. is worth $2.5 trillion.",
        "I can't believe it's not butter!",
        "Dr. Smith's office (room 42) is closed.",
        "U.S.A. and U.K. are allies.",
        "They're going to the store.",
        'He said "hello" and left.',
        "The cost is $500.00/month.",
        "New York-based company",
        "e-mail: test@example.com",
        "10,000 people",
        "3.14159 is pi",
        "state-of-the-art technology",
        "Mr. and Mrs. Jones",
        "it's 5:30pm",
    ]

    # make_doc runs only the tokenizer, not the full pipeline
    test_data = [
        {"text": sent, "tokens": [t.text for t in nlp.make_doc(sent)]}
        for sent in test_sentences
    ]

    test_path = Path("tests/tokenizer_expected.json")
    test_path.parent.mkdir(exist_ok=True)
    test_path.write_text(json.dumps(test_data, indent=2))
    print(f"wrote {test_path} ({len(test_data)} test cases)")
814
815
# run only when executed as a script, not on import
if __name__ == "__main__":
    main()