# this repo has no description
1"""generate tokenizer_data.zig from spaCy's en_core_web_sm tokenizer config.
2
3extracts:
4 - unicode character class tables (sorted ranges for binary search)
5 - prefix single-char set + multi-char literals + special rules
6 - suffix data (single-char set, multi-char literals, lookbehind rules)
7 - special cases table (1347 entries)
8
9the matching LOGIC lives in tokenizer.zig. this script only generates DATA tables.
10
11usage:
12 uv run --python 3.12 --with spacy \
13 --with 'en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl' \
14 python scripts/gen_tokenizer_data.py
15"""
16
17import json
18import re
19import sre_parse
20import sys
21from pathlib import Path
22
23
def load_spacy():
    """load spaCy and extract all tokenizer config."""
    # imported lazily so the module can be inspected without spaCy installed
    import spacy

    return spacy.load("en_core_web_sm").tokenizer
31
32
def extract_ranges(items):
    """convert sre_parse IN items to sorted, merged (lo, hi) ranges."""
    raw = []
    for op, arg in items:
        if op == sre_parse.LITERAL:
            raw.append((arg, arg))
        elif op == sre_parse.RANGE:
            raw.append(arg)
        elif op == sre_parse.CATEGORY:
            if arg == sre_parse.CATEGORY_DIGIT:
                raw.append((0x30, 0x39))
            elif arg == sre_parse.CATEGORY_WORD:
                # ascii \w: digits, uppercase, underscore, lowercase
                raw += [(0x30, 0x39), (0x41, 0x5A), (0x5F, 0x5F), (0x61, 0x7A)]

    # merge overlapping or adjacent ranges after sorting by lower bound
    merged = []
    for lo, hi in sorted(raw):
        if not merged or lo > merged[-1][1] + 1:
            merged.append((lo, hi))
        else:
            prev_lo, prev_hi = merged[-1]
            merged[-1] = (prev_lo, max(prev_hi, hi))
    return merged
54
55
def class_from_in_node(in_items):
    """extract character class from an IN node, handling NEGATE."""
    is_negated = False
    body = []
    for entry in in_items:
        if entry[0] == sre_parse.NEGATE:
            is_negated = True
        else:
            body.append(entry)
    return extract_ranges(body), is_negated
62
63
64# ── prefix data extraction ──
65
66
def extract_prefix_data(tok):
    """extract prefix pattern data: single chars, multi-char literals, char class, specials.

    returns a dict with keys:
      single_chars         — sorted, deduped codepoints matched as one-char prefixes
      multi_literals       — multi-char literal prefixes, longest first
      symbol_ranges        — the big unicode symbol class (>50 ranges heuristic)
      has_dots             — whether a ".."+ style prefix branch exists
      literal_unless_digit — chars whose branch carries a negative lookahead
                             (presumably "not before a digit" — confirm against
                             the actual spaCy pattern)
    """
    # NOTE(review): assumes the parsed pattern is (AT_BEGINNING, BRANCH) so the
    # alternatives live at parsed[1][1][1] — TODO confirm for other spaCy versions
    pat = tok.prefix_search.__self__.pattern
    parsed = sre_parse.parse(pat)
    branches = parsed[1][1][1]  # AT_BEGINNING, BRANCH

    single_chars = []  # codepoints matched as single-char prefix
    multi_literals = []  # multi-byte string prefixes
    symbol_ranges = []  # the big unicode symbol class
    dots = False  # whether ..+ is a prefix
    literal_unless_digit = []  # chars like + that don't match before digits

    for branch in branches:
        if len(branch) == 1:
            # single-node branch: either one literal or one character class
            op, val = branch[0]
            if op == sre_parse.LITERAL:
                single_chars.append(val)
            elif op == sre_parse.IN:
                ranges, _ = class_from_in_node(val)
                if len(ranges) > 50:
                    # heuristic: only the big symbol class has this many ranges
                    symbol_ranges = ranges
                else:
                    # small class — expand to individual chars
                    for lo, hi in ranges:
                        for cp in range(lo, hi + 1):
                            single_chars.append(cp)
        elif all(b[0] == sre_parse.LITERAL for b in branch):
            # run of literals -> a multi-char string prefix
            s = "".join(chr(b[1]) for b in branch)
            multi_literals.append(s)
        elif (
            len(branch) == 2
            and branch[0][0] == sre_parse.LITERAL
            and branch[1][0] == sre_parse.MAX_REPEAT
        ):
            # literal followed by a repeat — the ".."+ prefix branch
            dots = True
        elif (
            len(branch) == 2
            and branch[0][0] == sre_parse.LITERAL
            and branch[1][0] == sre_parse.ASSERT_NOT
        ):
            # literal guarded by a negative lookahead; only the literal is kept,
            # the lookahead content itself is not inspected here
            literal_unless_digit.append(branch[0][1])

    return {
        "single_chars": sorted(set(single_chars)),
        "multi_literals": sorted(multi_literals, key=lambda s: -len(s)),
        "symbol_ranges": symbol_ranges,
        "has_dots": dots,
        "literal_unless_digit": literal_unless_digit,
    }
116
117
118# ── suffix data extraction ──
119
120
def extract_suffix_data(tok):
    """extract suffix pattern data.

    returns a dict mirroring extract_prefix_data's shape, plus
    "lookbehind_rules": serialized (behind, suffix) descriptors produced by
    _extract_lookbehind_rule for suffixes guarded by a lookbehind.
    """
    # NOTE(review): assumes the parsed suffix pattern is a single top-level
    # BRANCH at parsed[0] — TODO confirm for other spaCy versions
    pat = tok.suffix_search.__self__.pattern
    parsed = sre_parse.parse(pat)
    branches = parsed[0][1][1]  # BRANCH

    single_chars = []
    multi_literals = []
    symbol_ranges = []
    has_dots = False
    lookbehind_rules = []

    for branch in branches:
        items = list(branch)
        # drop a trailing $ anchor so the classification below sees only content
        if items and items[-1] == (sre_parse.AT, sre_parse.AT_END):
            items = items[:-1]
        if not items:
            continue

        # simple literal(s)
        if all(x[0] == sre_parse.LITERAL for x in items):
            s = "".join(chr(x[1]) for x in items)
            if len(s) == 1:
                single_chars.append(ord(s))
            else:
                multi_literals.append(s)
            continue

        # character class
        if len(items) == 1 and items[0][0] == sre_parse.IN:
            ranges, _ = class_from_in_node(items[0][1])
            if len(ranges) > 50:
                # heuristic: only the big symbol class has this many ranges
                symbol_ranges = ranges
            else:
                for lo, hi in ranges:
                    for cp in range(lo, hi + 1):
                        single_chars.append(cp)
            continue

        # dots
        if (
            len(items) >= 2
            and items[0] == (sre_parse.LITERAL, ord("."))
            and items[1][0] == sre_parse.MAX_REPEAT
        ):
            has_dots = True
            continue

        # lookbehind rule
        if items[0][0] == sre_parse.ASSERT:
            direction = items[0][1][0]
            if direction == -1:  # lookbehind
                # unparseable rules return None and are silently skipped
                rule = _extract_lookbehind_rule(items)
                if rule:
                    lookbehind_rules.append(rule)
            continue

    return {
        "single_chars": sorted(set(single_chars)),
        "multi_literals": sorted(multi_literals, key=lambda s: -len(s)),
        "symbol_ranges": symbol_ranges,
        "has_dots": has_dots,
        "lookbehind_rules": lookbehind_rules,
    }
185
186
def _extract_lookbehind_rule(items):
    """extract a suffix lookbehind rule into a serializable structure."""
    # items[0] is the ASSERT node; its payload at [1][1] is the asserted content
    behind = _parse_assert_content(items[0][1][1])
    if behind is None:
        return None

    # everything after the assertion is the suffix text itself
    suffix = _parse_suffix_part(items[1:])
    if suffix is None:
        return None

    return {"behind": behind, "suffix": suffix}
203
204
205def _parse_assert_content(content):
206 """parse lookbehind/lookahead content into a descriptor."""
207 parts = []
208 for item in content:
209 if item[0] == sre_parse.IN:
210 ranges, negated = class_from_in_node(item[1])
211 parts.append({"type": "class", "ranges": ranges, "negated": negated})
212 elif item[0] == sre_parse.LITERAL:
213 parts.append({"type": "literal", "char": item[1]})
214 else:
215 return None
216 if len(parts) == 1:
217 return parts[0]
218 elif len(parts) > 1:
219 return {"type": "sequence", "parts": parts}
220 return None
221
222
223def _parse_suffix_part(items):
224 """parse the suffix portion after lookbehind."""
225 if all(x[0] == sre_parse.LITERAL for x in items):
226 s = "".join(chr(x[1]) for x in items)
227 return {"type": "literal", "text": s}
228
229 # subpattern with alternatives
230 if len(items) == 1 and items[0][0] == sre_parse.SUBPATTERN:
231 content = items[0][1][3]
232 if content and content[0][0] == sre_parse.BRANCH:
233 alts = []
234 for branch in content[0][1][1]:
235 if all(x[0] == sre_parse.LITERAL for x in branch):
236 alts.append("".join(chr(x[1]) for x in branch))
237 if alts:
238 return {"type": "alternatives", "texts": alts}
239
240 # BRANCH directly
241 if len(items) == 1 and items[0][0] == sre_parse.BRANCH:
242 alts = []
243 for branch in items[0][1][1]:
244 if all(x[0] == sre_parse.LITERAL for x in branch):
245 alts.append("".join(chr(x[1]) for x in branch))
246 if alts:
247 return {"type": "alternatives", "texts": alts}
248
249 return None
250
251
252# ── unicode class extraction from all patterns ──
253
254
def extract_named_classes(tok):
    """extract the specific unicode character classes used across patterns.

    we identify them by their content:
     - symbol: the big So/Sc class (~174 ranges)
     - lower: lowercase letters (contains a-z)
     - upper: uppercase letters (contains A-Z)
     - alpha: lower + upper
     - alnum: alpha + digits
     - lower_or_punct: the wide "not just upper" class used in suffix lookbehinds

    returns the dict of classes filled in by _classify; keys are the bucket
    names above (wide classes get a generated "wide_<n>" key).
    """
    classes = {}

    # extract from suffix lookbehinds
    suffix_pat = tok.suffix_search.__self__.pattern
    sp = sre_parse.parse(suffix_pat)

    # recursive walk over the sre parse tree; every sufficiently large IN node
    # (> 5 ranges) is handed to _classify. `label` is currently unused beyond
    # the recursive pass-through; negation of classes is ignored here.
    def walk_for_classes(items, label=""):
        for item in items:
            op = item[0]
            if op == sre_parse.IN:
                ranges, negated = class_from_in_node(item[1])
                if len(ranges) > 5:
                    _classify(ranges, classes)
            elif op == sre_parse.BRANCH:
                # item[1] is (None, [branch, ...]) — recurse into each branch
                for b in item[1][1]:
                    walk_for_classes(b, label)
            elif op in (sre_parse.ASSERT, sre_parse.ASSERT_NOT):
                # item[1] is (direction, content) — recurse into the content
                walk_for_classes(item[1][1], label)
            elif op == sre_parse.SUBPATTERN:
                # item[1] is (group, add_flags, del_flags, content)
                if item[1][3]:
                    walk_for_classes(list(item[1][3]), label)

    walk_for_classes(list(sp), "suffix")

    # also from infix
    infix_pat = tok.infix_finditer.__self__.pattern
    ip = sre_parse.parse(infix_pat)
    walk_for_classes(list(ip), "infix")

    return classes
296
297
298def _classify(ranges, classes):
299 """classify a character range set by its content."""
300 range_set = set(ranges)
301
302 # check for a-z presence → lower
303 has_az = (0x61, 0x7A) in range_set
304 has_AZ = (0x41, 0x5A) in range_set
305 has_09 = (0x30, 0x39) in range_set or (0x30, 0x39) in range_set
306
307 n_ranges = len(ranges)
308 n_cp = sum(hi - lo + 1 for lo, hi in ranges)
309
310 if has_az and not has_AZ and not has_09 and n_cp > 1000:
311 if "lower" not in classes or len(ranges) > len(classes["lower"]):
312 classes["lower"] = ranges
313 elif has_AZ and not has_az and not has_09 and n_cp > 1000:
314 if "upper" not in classes or len(ranges) > len(classes["upper"]):
315 classes["upper"] = ranges
316 elif has_az and has_AZ and not has_09 and n_cp > 1000:
317 if "alpha" not in classes or len(ranges) > len(classes["alpha"]):
318 classes["alpha"] = ranges
319 elif has_az and has_AZ and has_09 and n_cp > 1000:
320 if "alnum" not in classes or len(ranges) > len(classes["alnum"]):
321 classes["alnum"] = ranges
322 elif n_cp > 100000 and n_ranges > 300:
323 # very large class — likely "lower_or_punct" or similar
324 key = f"wide_{n_ranges}"
325 classes[key] = ranges
326
327
328# ── special cases ──
329
330
def extract_specials(tok):
    """extract special case rules."""
    ORTH = 65  # spacy.attrs.ORTH
    return [
        (key, [token_attrs[ORTH] for token_attrs in attrs_list])
        for key, attrs_list in sorted(tok.rules.items())
    ]
338
339
340# ── zig code generation ──
341
342
def zig_str(s):
    """convert a python string to a zig string literal."""
    escapes = {'"': '\\"', "\\": "\\\\", "\n": "\\n", "\t": "\\t"}
    out = []
    for ch in s:
        cp = ord(ch)
        if cp >= 128:
            # non-ascii: emit the raw utf-8 bytes as hex escapes
            out.extend(f"\\x{b:02x}" for b in ch.encode("utf-8"))
        elif ch in escapes:
            out.append(escapes[ch])
        elif ch.isprintable():
            out.append(ch)
        else:
            # remaining ascii control chars
            out.append(f"\\x{cp:02x}")
    return '"' + "".join(out) + '"'
365
366
def zig_char(cp):
    """convert a codepoint to a zig u21 literal."""
    is_plain_ascii = 32 <= cp < 127 and chr(cp) not in "'\\\""
    if is_plain_ascii:
        return "'" + chr(cp) + "'"
    # quote, backslash and non-printable/non-ascii fall back to hex
    return f"0x{cp:04X}"
372
373
def gen_range_table(name, ranges):
    """generate a const range table + lookup function."""
    rows = [f"    .{{ 0x{lo:04X}, 0x{hi:04X} }}," for lo, hi in ranges]
    return "\n".join(
        [f"pub const {name}_ranges = [_][2]u21{{"]
        + rows
        + [
            "};",
            "",
            f"pub fn {name}(c: u21) bool {{",
            f"    return rangeContains(&{name}_ranges, c);",
            "}",
        ]
    )
386
387
def gen_codepoint_set(name, codepoints):
    """generate a switch-based codepoint set."""
    cps = sorted(set(codepoints))

    # collapse sorted codepoints into consecutive (start, end) runs
    runs = []
    for cp in cps:
        if runs and cp == runs[-1][1] + 1:
            runs[-1] = (runs[-1][0], cp)
        else:
            runs.append((cp, cp))

    arms = []
    for lo, hi in runs:
        pattern = zig_char(lo) if lo == hi else f"{zig_char(lo)}...{zig_char(hi)}"
        arms.append(f"        {pattern} => true,")

    return "\n".join(
        [f"pub fn {name}(c: u21) bool {{", "    return switch (c) {"]
        + arms
        + ["        else => false,", "    };", "}"]
    )
415
416
def gen_specials(entries):
    """generate the special cases StaticStringMap."""
    widest = max(len(orths) for _, orths in entries)
    assert widest <= 3, f"max tokens {widest} > 3"

    out = [
        "pub const SpecialCase = struct {",
        "    tokens: [3][]const u8,",
        "    len: u8,",
        "};",
        "",
        "pub const specials = std.StaticStringMap(SpecialCase).initComptime(.{",
    ]
    for key, orths in entries:
        # pad the token list to exactly 3 entries with empty strings
        padded = [zig_str(o) for o in orths] + ['""'] * (3 - len(orths))
        joined = ", ".join(padded)
        out.append(
            f"    .{{ {zig_str(key)}, SpecialCase{{ .tokens = .{{ {joined} }}, .len = {len(orths)} }} }},"
        )
    out.append("});")
    return "\n".join(out)
442
443
def gen_multi_literals(name, literals):
    """generate an array of multi-char literals for matching."""
    header = f"pub const {name} = [_][]const u8{{"
    body = [f"    {zig_str(lit)}," for lit in literals]
    return "\n".join([header] + body + ["};"])
452
453
def gen_lookbehind_rules(rules):
    """generate suffix lookbehind rule data structures.

    returns (zig_code, rule_descs): the class-table helper functions as zig
    source, plus per-rule descriptors consumed later by generate().
    """
    class_tables = {}
    rule_descs = []

    for rule in rules:
        behind = rule["behind"]
        suffix = rule["suffix"]
        if suffix["type"] == "literal":
            texts = [suffix["text"]]
        else:
            texts = suffix.get("texts", [])
        rule_descs.append(
            {
                "behind_id": _get_class_id(behind, class_tables),
                "behind": behind,
                "suffix_texts": texts,
            }
        )

    # emit one range table + predicate per unique lookbehind class
    code = []
    for cid, ranges in class_tables.items():
        code.append(f"const lookbehind_class_{cid}_ranges = [_][2]u21{{")
        code.extend(f"    .{{ 0x{lo:04X}, 0x{hi:04X} }}," for lo, hi in ranges)
        code.append("};")
        code.append("")
        code.append(f"pub fn matchLookbehind{cid}(c: u21) bool {{")
        code.append(f"    return rangeContains(&lookbehind_class_{cid}_ranges, c);")
        code.append("}")
        code.append("")

    return "\n".join(code), rule_descs
489
490
# shared mutable state for _get_class_id: _class_cache maps a class's range-list
# repr to a stable integer id drawn from _class_counter. generate() resets both
# before emitting the lookbehind tables so ids start from 0 each run.
_class_counter = 0
_class_cache = {}
493
494
def _get_class_id(behind, class_tables):
    """map a lookbehind descriptor to a stable id used in generated zig names.

    returns:
      int                    — for a "class" descriptor (deduped via _class_cache)
      tuple of ids           — for a "sequence" (one id per part, in order)
      ("literal", codepoint) — for a single literal char
      None                   — for anything unrecognized
    side effect: newly seen classes are registered in `class_tables[cid]`.
    relies on the module-level _class_counter/_class_cache being reset by
    generate() before a run.
    """
    global _class_counter
    if behind["type"] == "class":
        # key on the range list's string form so identical classes share one id
        key = str(behind["ranges"])
        if key not in _class_cache:
            cid = _class_counter
            _class_counter += 1
            _class_cache[key] = cid
            class_tables[cid] = behind["ranges"]
        return _class_cache[key]
    elif behind["type"] == "sequence":
        # sequence of tests — generate IDs for each part
        ids = []
        for part in behind["parts"]:
            ids.append(_get_class_id(part, class_tables))
        return tuple(ids)
    elif behind["type"] == "literal":
        return ("literal", behind["char"])
    return None
514
515
def generate(tok):
    """generate the complete tokenizer_data.zig source as one string.

    emitted layout: file header, utf-8 helpers, binary-search range lookup,
    symbol class, prefix data, suffix data (including lookbehind rules as a
    single matchSuffixLookbehind function), infix classes, special cases.
    the caller is responsible for writing the result to disk.
    """
    print("extracting prefix data...")
    prefix = extract_prefix_data(tok)
    print(
        f"  {len(prefix['single_chars'])} single chars, "
        f"{len(prefix['multi_literals'])} multi literals, "
        f"{len(prefix['symbol_ranges'])} symbol ranges"
    )

    print("extracting suffix data...")
    suffix = extract_suffix_data(tok)
    print(
        f"  {len(suffix['single_chars'])} single chars, "
        f"{len(suffix['multi_literals'])} multi literals, "
        f"{len(suffix['lookbehind_rules'])} lookbehind rules"
    )

    print("extracting unicode classes...")
    classes = extract_named_classes(tok)
    print(f"  classes found: {list(classes.keys())}")

    print("extracting specials...")
    specials = extract_specials(tok)
    print(f"  {len(specials)} entries")

    # also extract the infix character classes directly
    # NOTE(review): assumes the infix pattern parses to a top-level BRANCH at
    # ip[0] and that branch indices 3..7 have the shapes listed below — these
    # indices are tied to the en_core_web_sm 3.8.0 pattern; confirm on upgrade.
    infix_pat = tok.infix_finditer.__self__.pattern
    ip = sre_parse.parse(infix_pat)
    infix_branches = ip[0][1][1]

    # infix[2] is the symbol class (same as prefix)
    # infix[3] lookbehind is digits, chars are +-*^, lookahead is digits+hyphen
    # infix[4] lookbehind is lower/punct, ahead is upper/alpha
    # infix[5] lookbehind is alpha, ahead is alpha
    # infix[6] branch alternatives: -, --, ---, ~, en-dash, em-dash, em-dash*2
    # infix[7] lookbehind is alnum, chars :/~<=>, ahead is alpha

    # extract infix lookbehind/lookahead classes
    infix_classes = {}
    for idx in [3, 4, 5, 6, 7]:
        branch = infix_branches[idx]
        for item in branch:
            if item[0] == sre_parse.ASSERT:
                direction = item[1][0]
                content = item[1][1]
                if len(content) == 1 and content[0][0] == sre_parse.IN:
                    ranges, _ = class_from_in_node(content[0][1])
                    label = (
                        f"infix_{idx}_{'behind' if direction == -1 else 'ahead'}"
                    )
                    infix_classes[label] = ranges

    # build output
    sections = []
    sections.append("//! generated by scripts/gen_tokenizer_data.py — do not edit.")
    sections.append("//! tokenizer pattern data compiled from spaCy en_core_web_sm.")
    sections.append("")
    sections.append('const std = @import("std");')
    sections.append("")

    # ── utf-8 helpers ──
    sections.append("// ── utf-8 helpers ──")
    sections.append("")
    sections.append("pub const Codepoint = struct { value: u21, len: u3 };")
    sections.append("")
    sections.append("pub fn decodeUtf8(bytes: []const u8) ?Codepoint {")
    sections.append("    if (bytes.len == 0) return null;")
    sections.append("    const b0 = bytes[0];")
    sections.append("    if (b0 < 0x80) return .{ .value = b0, .len = 1 };")
    sections.append("    if (b0 & 0xE0 == 0xC0 and bytes.len >= 2)")
    sections.append(
        "        return .{ .value = (@as(u21, b0 & 0x1F) << 6) | (bytes[1] & 0x3F), .len = 2 };"
    )
    sections.append("    if (b0 & 0xF0 == 0xE0 and bytes.len >= 3)")
    sections.append(
        "        return .{ .value = (@as(u21, b0 & 0x0F) << 12) | (@as(u21, bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F), .len = 3 };"
    )
    sections.append("    if (b0 & 0xF8 == 0xF0 and bytes.len >= 4)")
    sections.append(
        "        return .{ .value = (@as(u21, b0 & 0x07) << 18) | (@as(u21, bytes[1] & 0x3F) << 12) | (@as(u21, bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F), .len = 4 };"
    )
    # invalid leading byte: emit U+FFFD and advance one byte
    sections.append(
        '    return .{ .value = 0xFFFD, .len = 1 }; // replacement char'
    )
    sections.append("}")
    sections.append("")
    sections.append("pub fn lastCodepoint(text: []const u8) ?Codepoint {")
    sections.append("    if (text.len == 0) return null;")
    sections.append("    var i = text.len - 1;")
    sections.append("    while (i > 0 and text[i] & 0xC0 == 0x80) : (i -= 1) {}")
    sections.append("    return decodeUtf8(text[i..]);")
    sections.append("}")
    sections.append("")

    # ── range search ──
    sections.append("// ── range search ──")
    sections.append("")
    sections.append("fn rangeContains(ranges: []const [2]u21, c: u21) bool {")
    sections.append("    var lo: usize = 0;")
    sections.append("    var hi: usize = ranges.len;")
    sections.append("    while (lo < hi) {")
    sections.append("        const mid = lo + (hi - lo) / 2;")
    sections.append("        if (c > ranges[mid][1]) { lo = mid + 1; }")
    sections.append("        else if (c < ranges[mid][0]) { hi = mid; }")
    sections.append("        else return true;")
    sections.append("    }")
    sections.append("    return false;")
    sections.append("}")
    sections.append("")

    # ── symbol class (shared by prefix, suffix, infix) ──
    sections.append("// ── symbol class (So/Sc unicode categories) ──")
    sections.append("")
    sections.append(gen_range_table("isSymbol", prefix["symbol_ranges"]))
    sections.append("")

    # ── prefix data ──
    sections.append("// ── prefix data ──")
    sections.append("")
    sections.append(gen_codepoint_set("isPrefixChar", prefix["single_chars"]))
    sections.append("")
    sections.append(
        gen_multi_literals("prefix_multi_literals", prefix["multi_literals"])
    )
    sections.append("")
    if prefix["literal_unless_digit"]:
        cps = prefix["literal_unless_digit"]
        sections.append(gen_codepoint_set("isPrefixUnlessDigit", cps))
        sections.append("")

    # ── suffix data ──
    sections.append("// ── suffix data ──")
    sections.append("")
    sections.append(gen_codepoint_set("isSuffixChar", suffix["single_chars"]))
    sections.append("")
    sections.append(
        gen_multi_literals("suffix_multi_literals", suffix["multi_literals"])
    )
    sections.append("")

    # lookbehind helpers
    # reset the module-level id registry so class ids are stable per run
    global _class_counter, _class_cache
    _class_counter = 0
    _class_cache = {}

    lookbehind_code, rule_descs = gen_lookbehind_rules(suffix["lookbehind_rules"])
    if lookbehind_code.strip():
        sections.append("// ── suffix lookbehind helpers ──")
        sections.append("")
        sections.append(lookbehind_code)

    # generate a compact suffix lookbehind rule table
    # each rule is: check lookbehind condition, then try matching suffix text(s)
    sections.append("// ── suffix lookbehind rules ──")
    sections.append("// these are checked by tokenizer.zig matchSuffix()")
    sections.append(
        "// format: for each rule, check behind condition then try suffix literal(s)"
    )
    sections.append("")

    # encode rules as Zig code in a single function
    sections.append("pub fn matchSuffixLookbehind(text: []const u8) usize {")
    sections.append("    if (text.len < 2) return 0;")
    sections.append("")

    for ri, desc in enumerate(rule_descs):  # ri unused; kept for debugging
        behind = desc["behind"]
        suffix_texts = desc["suffix_texts"]

        # sort suffix texts longest first
        suffix_texts_sorted = sorted(suffix_texts, key=lambda s: -len(s.encode("utf-8")))

        for st in suffix_texts_sorted:
            blen = len(st.encode("utf-8"))
            zig_lit = zig_str(st)

            sections.append(
                f"    if (std.mem.endsWith(u8, text, {zig_lit}) and text.len > {blen}) {{"
            )

            bid = desc["behind_id"]
            if isinstance(bid, int):
                # simple class check
                sections.append(
                    f"        const before = lastCodepoint(text[0 .. text.len - {blen}]);"
                )
                sections.append(
                    f"        if (before != null and matchLookbehind{bid}(before.?.value)) return {blen};"
                )
            elif isinstance(bid, tuple) and isinstance(bid[0], str) and bid[0] == "literal":
                # literal check
                cp = bid[1]
                sections.append(
                    f"        const before = lastCodepoint(text[0 .. text.len - {blen}]);"
                )
                sections.append(
                    f"        if (before != null and before.?.value == {zig_char(cp)}) return {blen};"
                )
            elif isinstance(bid, tuple):
                # sequence check (multiple lookbehinds)
                # NOTE(review): only 2-element sequences get a body; longer
                # sequences emit an empty `if (b1)` block — confirm intended
                sections.append(
                    f"        const b1 = lastCodepoint(text[0 .. text.len - {blen}]);"
                )
                sections.append(f"        if (b1) |bp1| {{")

                if len(bid) == 2:
                    sections.append(
                        f"            const b2 = lastCodepoint(text[0 .. text.len - {blen} - bp1.len]);"
                    )
                    # bid[0] is the class before bp2, bid[1] is the class for bp1
                    test1 = (
                        f"matchLookbehind{bid[1]}(bp1.value)"
                        if isinstance(bid[1], int)
                        else f"bp1.value == {zig_char(bid[1][1])}"
                    )
                    test0 = (
                        f"matchLookbehind{bid[0]}(b2p.value)"
                        if isinstance(bid[0], int)
                        else f"b2p.value == {zig_char(bid[0][1])}"
                    )
                    sections.append(f"            if ({test1}) {{")
                    sections.append(f"                if (b2) |b2p| {{")
                    sections.append(
                        f"                    if ({test0}) return {blen};"
                    )
                    sections.append(f"                }}")
                    sections.append(f"            }}")

                sections.append(f"        }}")

            sections.append("    }")

    sections.append("    return 0;")
    sections.append("}")
    sections.append("")

    # ── infix character class tables ──
    sections.append("// ── infix character classes ──")
    sections.append("")
    for label, ranges in sorted(infix_classes.items()):
        name = f"is_{label}"
        sections.append(gen_range_table(name, ranges))
        sections.append("")

    # ── specials ──
    sections.append("// ── special cases ──")
    sections.append("")
    sections.append(gen_specials(specials))
    sections.append("")

    return "\n".join(sections)
768
769
def main():
    """drive the full pipeline: load spaCy, emit zig data, dump test fixtures."""
    print("loading spaCy...")
    tok = load_spacy()

    print("\ngenerating zig source...")
    zig_source = generate(tok)

    out_path = Path("src/tokenizer_data.zig")
    out_path.write_text(zig_source)
    n_lines = zig_source.count("\n") + 1
    print(f"\nwrote {out_path} ({len(zig_source):,} bytes, {n_lines:,} lines)")

    # verification: run spaCy tokenizer on test inputs and dump expected output
    print("\ngenerating test data...")
    import spacy

    nlp = spacy.load("en_core_web_sm")
    test_sentences = [
        "Barack Obama visited Paris.",
        "Apple Inc. is worth $2.5 trillion.",
        "I can't believe it's not butter!",
        "Dr. Smith's office (room 42) is closed.",
        "U.S.A. and U.K. are allies.",
        "They're going to the store.",
        'He said "hello" and left.',
        "The cost is $500.00/month.",
        "New York-based company",
        "e-mail: test@example.com",
        "10,000 people",
        "3.14159 is pi",
        "state-of-the-art technology",
        "Mr. and Mrs. Jones",
        "it's 5:30pm",
    ]

    # make_doc runs only the tokenizer, not the full pipeline
    test_data = [
        {"text": sent, "tokens": [t.text for t in nlp.make_doc(sent)]}
        for sent in test_sentences
    ]

    test_path = Path("tests/tokenizer_expected.json")
    test_path.parent.mkdir(exist_ok=True)
    test_path.write_text(json.dumps(test_data, indent=2))
    print(f"wrote {test_path} ({len(test_data)} test cases)")
814
815
# run only when executed as a script, not on import
if __name__ == "__main__":
    main()