this repo has no description
at main 817 lines 28 kB view raw
1"""generate tokenizer_data.zig from spaCy's en_core_web_sm tokenizer config. 2 3extracts: 4 - unicode character class tables (sorted ranges for binary search) 5 - prefix single-char set + multi-char literals + special rules 6 - suffix data (single-char set, multi-char literals, lookbehind rules) 7 - special cases table (1347 entries) 8 9the matching LOGIC lives in tokenizer.zig. this script only generates DATA tables. 10 11usage: 12 uv run --python 3.12 --with spacy \ 13 --with 'en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl' \ 14 python scripts/gen_tokenizer_data.py 15""" 16 17import json 18import re 19import sre_parse 20import sys 21from pathlib import Path 22 23 24def load_spacy(): 25 """load spaCy and extract all tokenizer config.""" 26 import spacy 27 28 nlp = spacy.load("en_core_web_sm") 29 tok = nlp.tokenizer 30 return tok 31 32 33def extract_ranges(items): 34 """convert sre_parse IN items to sorted, merged (lo, hi) ranges.""" 35 ranges = [] 36 for op, val in items: 37 if op == sre_parse.LITERAL: 38 ranges.append((val, val)) 39 elif op == sre_parse.RANGE: 40 ranges.append(val) 41 elif op == sre_parse.CATEGORY: 42 if val == sre_parse.CATEGORY_DIGIT: 43 ranges.append((0x30, 0x39)) 44 elif val == sre_parse.CATEGORY_WORD: 45 ranges.extend([(0x30, 0x39), (0x41, 0x5A), (0x5F, 0x5F), (0x61, 0x7A)]) 46 ranges.sort() 47 merged = [] 48 for lo, hi in ranges: 49 if merged and lo <= merged[-1][1] + 1: 50 merged[-1] = (merged[-1][0], max(merged[-1][1], hi)) 51 else: 52 merged.append((lo, hi)) 53 return merged 54 55 56def class_from_in_node(in_items): 57 """extract character class from an IN node, handling NEGATE.""" 58 negated = any(x[0] == sre_parse.NEGATE for x in in_items) 59 non_neg = [x for x in in_items if x[0] != sre_parse.NEGATE] 60 ranges = extract_ranges(non_neg) 61 return ranges, negated 62 63 64# ── prefix data extraction ── 65 66 67def extract_prefix_data(tok): 68 """extract prefix pattern data: single chars, multi-char literals, char class, specials.""" 69 pat = tok.prefix_search.__self__.pattern 70 parsed = sre_parse.parse(pat) 71 branches = parsed[1][1][1] # AT_BEGINNING, BRANCH 72 73 single_chars = [] # codepoints matched as single-char prefix 74 multi_literals = [] # multi-byte string prefixes 75 symbol_ranges = [] # the big unicode symbol class 76 dots = False # whether ..+ is a prefix 77 literal_unless_digit = [] # chars like + that don't match before digits 78 79 for branch in branches: 80 if len(branch) == 1: 81 op, val = branch[0] 82 if op == sre_parse.LITERAL: 83 single_chars.append(val) 84 elif op == sre_parse.IN: 85 ranges, _ = class_from_in_node(val) 86 if len(ranges) > 50: 87 symbol_ranges = ranges 88 else: 89 # small class — expand to individual chars 90 for lo, hi in ranges: 91 for cp in range(lo, hi + 1): 92 single_chars.append(cp) 93 elif all(b[0] == sre_parse.LITERAL for b in branch): 94 s = "".join(chr(b[1]) for b in branch) 95 multi_literals.append(s) 96 elif ( 97 len(branch) == 2 98 and branch[0][0] == sre_parse.LITERAL 99 and branch[1][0] == sre_parse.MAX_REPEAT 100 ): 101 dots = True 102 elif ( 103 len(branch) == 2 104 and branch[0][0] == sre_parse.LITERAL 105 and branch[1][0] == sre_parse.ASSERT_NOT 106 ): 107 literal_unless_digit.append(branch[0][1]) 108 109 return { 110 "single_chars": sorted(set(single_chars)), 111 "multi_literals": sorted(multi_literals, key=lambda s: -len(s)), 112 "symbol_ranges": symbol_ranges, 113 "has_dots": dots, 114 "literal_unless_digit": literal_unless_digit, 115 } 116 117 118# ── suffix data extraction ── 119 120 121def extract_suffix_data(tok): 122 """extract suffix pattern data.""" 123 pat = tok.suffix_search.__self__.pattern 124 parsed = sre_parse.parse(pat) 125 branches = parsed[0][1][1] # BRANCH 126 127 single_chars = [] 128 multi_literals = [] 129 symbol_ranges = [] 130 has_dots = False 131 lookbehind_rules = [] 132 133 for branch in branches: 134 items = list(branch) 135 if items and items[-1] == (sre_parse.AT, sre_parse.AT_END): 136 items = items[:-1] 137 if not items: 138 continue 139 140 # simple literal(s) 141 if all(x[0] == sre_parse.LITERAL for x in items): 142 s = "".join(chr(x[1]) for x in items) 143 if len(s) == 1: 144 single_chars.append(ord(s)) 145 else: 146 multi_literals.append(s) 147 continue 148 149 # character class 150 if len(items) == 1 and items[0][0] == sre_parse.IN: 151 ranges, _ = class_from_in_node(items[0][1]) 152 if len(ranges) > 50: 153 symbol_ranges = ranges 154 else: 155 for lo, hi in ranges: 156 for cp in range(lo, hi + 1): 157 single_chars.append(cp) 158 continue 159 160 # dots 161 if ( 162 len(items) >= 2 163 and items[0] == (sre_parse.LITERAL, ord(".")) 164 and items[1][0] == sre_parse.MAX_REPEAT 165 ): 166 has_dots = True 167 continue 168 169 # lookbehind rule 170 if items[0][0] == sre_parse.ASSERT: 171 direction = items[0][1][0] 172 if direction == -1: # lookbehind 173 rule = _extract_lookbehind_rule(items) 174 if rule: 175 lookbehind_rules.append(rule) 176 continue 177 178 return { 179 "single_chars": sorted(set(single_chars)), 180 "multi_literals": sorted(multi_literals, key=lambda s: -len(s)), 181 "symbol_ranges": symbol_ranges, 182 "has_dots": has_dots, 183 "lookbehind_rules": lookbehind_rules, 184 } 185 186 187def _extract_lookbehind_rule(items): 188 """extract a suffix lookbehind rule into a serializable structure.""" 189 behind_content = items[0][1][1] 190 rest = items[1:] 191 192 # parse lookbehind 193 behind = _parse_assert_content(behind_content) 194 if behind is None: 195 return None 196 197 # parse suffix part 198 suffix = _parse_suffix_part(rest) 199 if suffix is None: 200 return None 201 202 return {"behind": behind, "suffix": suffix} 203 204 205def _parse_assert_content(content): 206 """parse lookbehind/lookahead content into a descriptor.""" 207 parts = [] 208 for item in content: 209 if item[0] == sre_parse.IN: 210 ranges, negated = class_from_in_node(item[1]) 211 parts.append({"type": "class", "ranges": ranges, "negated": negated}) 212 elif item[0] == sre_parse.LITERAL: 213 parts.append({"type": "literal", "char": item[1]}) 214 else: 215 return None 216 if len(parts) == 1: 217 return parts[0] 218 elif len(parts) > 1: 219 return {"type": "sequence", "parts": parts} 220 return None 221 222 223def _parse_suffix_part(items): 224 """parse the suffix portion after lookbehind.""" 225 if all(x[0] == sre_parse.LITERAL for x in items): 226 s = "".join(chr(x[1]) for x in items) 227 return {"type": "literal", "text": s} 228 229 # subpattern with alternatives 230 if len(items) == 1 and items[0][0] == sre_parse.SUBPATTERN: 231 content = items[0][1][3] 232 if content and content[0][0] == sre_parse.BRANCH: 233 alts = [] 234 for branch in content[0][1][1]: 235 if all(x[0] == sre_parse.LITERAL for x in branch): 236 alts.append("".join(chr(x[1]) for x in branch)) 237 if alts: 238 return {"type": "alternatives", "texts": alts} 239 240 # BRANCH directly 241 if len(items) == 1 and items[0][0] == sre_parse.BRANCH: 242 alts = [] 243 for branch in items[0][1][1]: 244 if all(x[0] == sre_parse.LITERAL for x in branch): 245 alts.append("".join(chr(x[1]) for x in branch)) 246 if alts: 247 return {"type": "alternatives", "texts": alts} 248 249 return None 250 251 252# ── unicode class extraction from all patterns ── 253 254 255def extract_named_classes(tok): 256 """extract the specific unicode character classes used across patterns. 257 258 we identify them by their content: 259 - symbol: the big So/Sc class (~174 ranges) 260 - lower: lowercase letters (contains a-z) 261 - upper: uppercase letters (contains A-Z) 262 - alpha: lower + upper 263 - alnum: alpha + digits 264 - lower_or_punct: the wide "not just upper" class used in suffix lookbehinds 265 """ 266 classes = {} 267 268 # extract from suffix lookbehinds 269 suffix_pat = tok.suffix_search.__self__.pattern 270 sp = sre_parse.parse(suffix_pat) 271 272 def walk_for_classes(items, label=""): 273 for item in items: 274 op = item[0] 275 if op == sre_parse.IN: 276 ranges, negated = class_from_in_node(item[1]) 277 if len(ranges) > 5: 278 _classify(ranges, classes) 279 elif op == sre_parse.BRANCH: 280 for b in item[1][1]: 281 walk_for_classes(b, label) 282 elif op in (sre_parse.ASSERT, sre_parse.ASSERT_NOT): 283 walk_for_classes(item[1][1], label) 284 elif op == sre_parse.SUBPATTERN: 285 if item[1][3]: 286 walk_for_classes(list(item[1][3]), label) 287 288 walk_for_classes(list(sp), "suffix") 289 290 # also from infix 291 infix_pat = tok.infix_finditer.__self__.pattern 292 ip = sre_parse.parse(infix_pat) 293 walk_for_classes(list(ip), "infix") 294 295 return classes 296 297 298def _classify(ranges, classes): 299 """classify a character range set by its content.""" 300 range_set = set(ranges) 301 302 # check for a-z presence → lower 303 has_az = (0x61, 0x7A) in range_set 304 has_AZ = (0x41, 0x5A) in range_set 305 has_09 = (0x30, 0x39) in range_set or (0x30, 0x39) in range_set 306 307 n_ranges = len(ranges) 308 n_cp = sum(hi - lo + 1 for lo, hi in ranges) 309 310 if has_az and not has_AZ and not has_09 and n_cp > 1000: 311 if "lower" not in classes or len(ranges) > len(classes["lower"]): 312 classes["lower"] = ranges 313 elif has_AZ and not has_az and not has_09 and n_cp > 1000: 314 if "upper" not in classes or len(ranges) > len(classes["upper"]): 315 classes["upper"] = ranges 316 elif has_az and has_AZ and not has_09 and n_cp > 1000: 317 if "alpha" not in classes or len(ranges) > len(classes["alpha"]): 318 classes["alpha"] = ranges 319 elif has_az and has_AZ and has_09 and n_cp > 1000: 320 if "alnum" not in classes or len(ranges) > len(classes["alnum"]): 321 classes["alnum"] = ranges 322 elif n_cp > 100000 and n_ranges > 300: 323 # very large class — likely "lower_or_punct" or similar 324 key = f"wide_{n_ranges}" 325 classes[key] = ranges 326 327 328# ── special cases ── 329 330 331def extract_specials(tok): 332 """extract special case rules.""" 333 entries = [] 334 for key, val in sorted(tok.rules.items()): 335 orths = [d[65] for d in val] # 65 = ORTH 336 entries.append((key, orths)) 337 return entries 338 339 340# ── zig code generation ── 341 342 343def zig_str(s): 344 """convert a python string to a zig string literal.""" 345 parts = [] 346 for c in s: 347 cp = ord(c) 348 if cp < 128: 349 if c == '"': 350 parts.append('\\"') 351 elif c == "\\": 352 parts.append("\\\\") 353 elif c == "\n": 354 parts.append("\\n") 355 elif c == "\t": 356 parts.append("\\t") 357 elif c.isprintable(): 358 parts.append(c) 359 else: 360 parts.append(f"\\x{cp:02x}") 361 else: 362 for b in c.encode("utf-8"): 363 parts.append(f"\\x{b:02x}") 364 return '"' + "".join(parts) + '"' 365 366 367def zig_char(cp): 368 """convert a codepoint to a zig u21 literal.""" 369 if 32 <= cp < 127 and chr(cp) not in "'\\\"": 370 return f"'{chr(cp)}'" 371 return f"0x{cp:04X}" 372 373 374def gen_range_table(name, ranges): 375 """generate a const range table + lookup function.""" 376 lines = [] 377 lines.append(f"pub const {name}_ranges = [_][2]u21{{") 378 for lo, hi in ranges: 379 lines.append(f" .{{ 0x{lo:04X}, 0x{hi:04X} }},") 380 lines.append("};") 381 lines.append("") 382 lines.append(f"pub fn {name}(c: u21) bool {{") 383 lines.append(f" return rangeContains(&{name}_ranges, c);") 384 lines.append("}") 385 return "\n".join(lines) 386 387 388def gen_codepoint_set(name, codepoints): 389 """generate a switch-based codepoint set.""" 390 lines = [] 391 lines.append(f"pub fn {name}(c: u21) bool {{") 392 lines.append(" return switch (c) {") 393 # group consecutive codepoints into ranges 394 ranges = [] 395 cps = sorted(set(codepoints)) 396 i = 0 397 while i < len(cps): 398 start = cps[i] 399 end = start 400 while i + 1 < len(cps) and cps[i + 1] == end + 1: 401 end = cps[i + 1] 402 i += 1 403 ranges.append((start, end)) 404 i += 1 405 406 for lo, hi in ranges: 407 if lo == hi: 408 lines.append(f" {zig_char(lo)} => true,") 409 else: 410 lines.append(f" {zig_char(lo)}...{zig_char(hi)} => true,") 411 lines.append(" else => false,") 412 lines.append(" };") 413 lines.append("}") 414 return "\n".join(lines) 415 416 417def gen_specials(entries): 418 """generate the special cases StaticStringMap.""" 419 max_tokens = max(len(orths) for _, orths in entries) 420 assert max_tokens <= 3, f"max tokens {max_tokens} > 3" 421 422 lines = [] 423 lines.append("pub const SpecialCase = struct {") 424 lines.append(" tokens: [3][]const u8,") 425 lines.append(" len: u8,") 426 lines.append("};") 427 lines.append("") 428 lines.append( 429 "pub const specials = std.StaticStringMap(SpecialCase).initComptime(.{" 430 ) 431 for key, orths in entries: 432 k = zig_str(key) 433 toks = [zig_str(o) for o in orths] 434 while len(toks) < 3: 435 toks.append('""') 436 tok_str = ", ".join(toks) 437 lines.append( 438 f" .{{ {k}, SpecialCase{{ .tokens = .{{ {tok_str} }}, .len = {len(orths)} }} }}," 439 ) 440 lines.append("});") 441 return "\n".join(lines) 442 443 444def gen_multi_literals(name, literals): 445 """generate an array of multi-char literals for matching.""" 446 lines = [] 447 lines.append(f"pub const {name} = [_][]const u8{{") 448 for lit in literals: 449 lines.append(f" {zig_str(lit)},") 450 lines.append("};") 451 return "\n".join(lines) 452 453 454def gen_lookbehind_rules(rules): 455 """generate suffix lookbehind rule data structures.""" 456 # identify unique character classes used in lookbehinds 457 class_tables = {} 458 rule_descs = [] 459 460 for rule in rules: 461 behind = rule["behind"] 462 suffix = rule["suffix"] 463 464 behind_id = _get_class_id(behind, class_tables) 465 suffix_texts = ( 466 [suffix["text"]] 467 if suffix["type"] == "literal" 468 else suffix.get("texts", []) 469 ) 470 rule_descs.append( 471 {"behind_id": behind_id, "behind": behind, "suffix_texts": suffix_texts} 472 ) 473 474 lines = [] 475 476 # generate class tables for lookbehinds 477 for cid, ranges in class_tables.items(): 478 lines.append(f"const lookbehind_class_{cid}_ranges = [_][2]u21{{") 479 for lo, hi in ranges: 480 lines.append(f" .{{ 0x{lo:04X}, 0x{hi:04X} }},") 481 lines.append("};") 482 lines.append("") 483 lines.append(f"pub fn matchLookbehind{cid}(c: u21) bool {{") 484 lines.append(f" return rangeContains(&lookbehind_class_{cid}_ranges, c);") 485 lines.append("}") 486 lines.append("") 487 488 return "\n".join(lines), rule_descs 489 490 491_class_counter = 0 492_class_cache = {} 493 494 495def _get_class_id(behind, class_tables): 496 global _class_counter 497 if behind["type"] == "class": 498 key = str(behind["ranges"]) 499 if key not in _class_cache: 500 cid = _class_counter 501 _class_counter += 1 502 _class_cache[key] = cid 503 class_tables[cid] = behind["ranges"] 504 return _class_cache[key] 505 elif behind["type"] == "sequence": 506 # sequence of tests — generate IDs for each part 507 ids = [] 508 for part in behind["parts"]: 509 ids.append(_get_class_id(part, class_tables)) 510 return tuple(ids) 511 elif behind["type"] == "literal": 512 return ("literal", behind["char"]) 513 return None 514 515 516def generate(tok): 517 """generate the complete tokenizer_data.zig.""" 518 print("extracting prefix data...") 519 prefix = extract_prefix_data(tok) 520 print( 521 f" {len(prefix['single_chars'])} single chars, " 522 f"{len(prefix['multi_literals'])} multi literals, " 523 f"{len(prefix['symbol_ranges'])} symbol ranges" 524 ) 525 526 print("extracting suffix data...") 527 suffix = extract_suffix_data(tok) 528 print( 529 f" {len(suffix['single_chars'])} single chars, " 530 f"{len(suffix['multi_literals'])} multi literals, " 531 f"{len(suffix['lookbehind_rules'])} lookbehind rules" 532 ) 533 534 print("extracting unicode classes...") 535 classes = extract_named_classes(tok) 536 print(f" classes found: {list(classes.keys())}") 537 538 print("extracting specials...") 539 specials = extract_specials(tok) 540 print(f" {len(specials)} entries") 541 542 # also extract the infix character classes directly 543 infix_pat = tok.infix_finditer.__self__.pattern 544 ip = sre_parse.parse(infix_pat) 545 infix_branches = ip[0][1][1] 546 547 # infix[2] is the symbol class (same as prefix) 548 # infix[3] lookbehind is digits, chars are +-*^, lookahead is digits+hyphen 549 # infix[4] lookbehind is lower/punct, ahead is upper/alpha 550 # infix[5] lookbehind is alpha, ahead is alpha 551 # infix[6] branch alternatives: -, --, ---, ~, en-dash, em-dash, em-dash*2 552 # infix[7] lookbehind is alnum, chars :/~<=>, ahead is alpha 553 554 # extract infix lookbehind/lookahead classes 555 infix_classes = {} 556 for idx in [3, 4, 5, 6, 7]: 557 branch = infix_branches[idx] 558 for item in branch: 559 if item[0] == sre_parse.ASSERT: 560 direction = item[1][0] 561 content = item[1][1] 562 if len(content) == 1 and content[0][0] == sre_parse.IN: 563 ranges, _ = class_from_in_node(content[0][1]) 564 label = ( 565 f"infix_{idx}_{'behind' if direction == -1 else 'ahead'}" 566 ) 567 infix_classes[label] = ranges 568 569 # build output 570 sections = [] 571 sections.append("//! generated by scripts/gen_tokenizer_data.py — do not edit.") 572 sections.append("//! tokenizer pattern data compiled from spaCy en_core_web_sm.") 573 sections.append("") 574 sections.append('const std = @import("std");') 575 sections.append("") 576 577 # ── utf-8 helpers ── 578 sections.append("// ── utf-8 helpers ──") 579 sections.append("") 580 sections.append("pub const Codepoint = struct { value: u21, len: u3 };") 581 sections.append("") 582 sections.append("pub fn decodeUtf8(bytes: []const u8) ?Codepoint {") 583 sections.append(" if (bytes.len == 0) return null;") 584 sections.append(" const b0 = bytes[0];") 585 sections.append(" if (b0 < 0x80) return .{ .value = b0, .len = 1 };") 586 sections.append(" if (b0 & 0xE0 == 0xC0 and bytes.len >= 2)") 587 sections.append( 588 " return .{ .value = (@as(u21, b0 & 0x1F) << 6) | (bytes[1] & 0x3F), .len = 2 };" 589 ) 590 sections.append(" if (b0 & 0xF0 == 0xE0 and bytes.len >= 3)") 591 sections.append( 592 " return .{ .value = (@as(u21, b0 & 0x0F) << 12) | (@as(u21, bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F), .len = 3 };" 593 ) 594 sections.append(" if (b0 & 0xF8 == 0xF0 and bytes.len >= 4)") 595 sections.append( 596 " return .{ .value = (@as(u21, b0 & 0x07) << 18) | (@as(u21, bytes[1] & 0x3F) << 12) | (@as(u21, bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F), .len = 4 };" 597 ) 598 sections.append( 599 ' return .{ .value = 0xFFFD, .len = 1 }; // replacement char' 600 ) 601 sections.append("}") 602 sections.append("") 603 sections.append("pub fn lastCodepoint(text: []const u8) ?Codepoint {") 604 sections.append(" if (text.len == 0) return null;") 605 sections.append(" var i = text.len - 1;") 606 sections.append(" while (i > 0 and text[i] & 0xC0 == 0x80) : (i -= 1) {}") 607 sections.append(" return decodeUtf8(text[i..]);") 608 sections.append("}") 609 sections.append("") 610 611 # ── range search ── 612 sections.append("// ── range search ──") 613 sections.append("") 614 sections.append("fn rangeContains(ranges: []const [2]u21, c: u21) bool {") 615 sections.append(" var lo: usize = 0;") 616 sections.append(" var hi: usize = ranges.len;") 617 sections.append(" while (lo < hi) {") 618 sections.append(" const mid = lo + (hi - lo) / 2;") 619 sections.append(" if (c > ranges[mid][1]) { lo = mid + 1; }") 620 sections.append(" else if (c < ranges[mid][0]) { hi = mid; }") 621 sections.append(" else return true;") 622 sections.append(" }") 623 sections.append(" return false;") 624 sections.append("}") 625 sections.append("") 626 627 # ── symbol class (shared by prefix, suffix, infix) ── 628 sections.append("// ── symbol class (So/Sc unicode categories) ──") 629 sections.append("") 630 sections.append(gen_range_table("isSymbol", prefix["symbol_ranges"])) 631 sections.append("") 632 633 # ── prefix data ── 634 sections.append("// ── prefix data ──") 635 sections.append("") 636 sections.append(gen_codepoint_set("isPrefixChar", prefix["single_chars"])) 637 sections.append("") 638 sections.append( 639 gen_multi_literals("prefix_multi_literals", prefix["multi_literals"]) 640 ) 641 sections.append("") 642 if prefix["literal_unless_digit"]: 643 cps = prefix["literal_unless_digit"] 644 sections.append(gen_codepoint_set("isPrefixUnlessDigit", cps)) 645 sections.append("") 646 647 # ── suffix data ── 648 sections.append("// ── suffix data ──") 649 sections.append("") 650 sections.append(gen_codepoint_set("isSuffixChar", suffix["single_chars"])) 651 sections.append("") 652 sections.append( 653 gen_multi_literals("suffix_multi_literals", suffix["multi_literals"]) 654 ) 655 sections.append("") 656 657 # lookbehind helpers 658 global _class_counter, _class_cache 659 _class_counter = 0 660 _class_cache = {} 661 662 lookbehind_code, rule_descs = gen_lookbehind_rules(suffix["lookbehind_rules"]) 663 if lookbehind_code.strip(): 664 sections.append("// ── suffix lookbehind helpers ──") 665 sections.append("") 666 sections.append(lookbehind_code) 667 668 # generate a compact suffix lookbehind rule table 669 # each rule is: check lookbehind condition, then try matching suffix text(s) 670 sections.append("// ── suffix lookbehind rules ──") 671 sections.append("// these are checked by tokenizer.zig matchSuffix()") 672 sections.append( 673 "// format: for each rule, check behind condition then try suffix literal(s)" 674 ) 675 sections.append("") 676 677 # encode rules as Zig code in a single function 678 sections.append("pub fn matchSuffixLookbehind(text: []const u8) usize {") 679 sections.append(" if (text.len < 2) return 0;") 680 sections.append("") 681 682 for ri, desc in enumerate(rule_descs): 683 behind = desc["behind"] 684 suffix_texts = desc["suffix_texts"] 685 686 # sort suffix texts longest first 687 suffix_texts_sorted = sorted(suffix_texts, key=lambda s: -len(s.encode("utf-8"))) 688 689 for st in suffix_texts_sorted: 690 blen = len(st.encode("utf-8")) 691 zig_lit = zig_str(st) 692 693 sections.append( 694 f" if (std.mem.endsWith(u8, text, {zig_lit}) and text.len > {blen}) {{" 695 ) 696 697 bid = desc["behind_id"] 698 if isinstance(bid, int): 699 # simple class check 700 sections.append( 701 f" const before = lastCodepoint(text[0 .. text.len - {blen}]);" 702 ) 703 sections.append( 704 f" if (before != null and matchLookbehind{bid}(before.?.value)) return {blen};" 705 ) 706 elif isinstance(bid, tuple) and isinstance(bid[0], str) and bid[0] == "literal": 707 # literal check 708 cp = bid[1] 709 sections.append( 710 f" const before = lastCodepoint(text[0 .. text.len - {blen}]);" 711 ) 712 sections.append( 713 f" if (before != null and before.?.value == {zig_char(cp)}) return {blen};" 714 ) 715 elif isinstance(bid, tuple): 716 # sequence check (multiple lookbehinds) 717 sections.append( 718 f" const b1 = lastCodepoint(text[0 .. text.len - {blen}]);" 719 ) 720 sections.append(f" if (b1) |bp1| {{") 721 722 if len(bid) == 2: 723 sections.append( 724 f" const b2 = lastCodepoint(text[0 .. text.len - {blen} - bp1.len]);" 725 ) 726 # bid[0] is the class before bp2, bid[1] is the class for bp1 727 test1 = ( 728 f"matchLookbehind{bid[1]}(bp1.value)" 729 if isinstance(bid[1], int) 730 else f"bp1.value == {zig_char(bid[1][1])}" 731 ) 732 test0 = ( 733 f"matchLookbehind{bid[0]}(b2p.value)" 734 if isinstance(bid[0], int) 735 else f"b2p.value == {zig_char(bid[0][1])}" 736 ) 737 sections.append(f" if ({test1}) {{") 738 sections.append(f" if (b2) |b2p| {{") 739 sections.append( 740 f" if ({test0}) return {blen};" 741 ) 742 sections.append(f" }}") 743 sections.append(f" }}") 744 745 sections.append(f" }}") 746 747 sections.append(" }") 748 749 sections.append(" return 0;") 750 sections.append("}") 751 sections.append("") 752 753 # ── infix character class tables ── 754 sections.append("// ── infix character classes ──") 755 sections.append("") 756 for label, ranges in sorted(infix_classes.items()): 757 name = f"is_{label}" 758 sections.append(gen_range_table(name, ranges)) 759 sections.append("") 760 761 # ── specials ── 762 sections.append("// ── special cases ──") 763 sections.append("") 764 sections.append(gen_specials(specials)) 765 sections.append("") 766 767 return "\n".join(sections) 768 769 770def main(): 771 print("loading spaCy...") 772 tok = load_spacy() 773 774 print("\ngenerating zig source...") 775 zig_source = generate(tok) 776 777 out_path = Path("src/tokenizer_data.zig") 778 out_path.write_text(zig_source) 779 n_lines = zig_source.count("\n") + 1 780 print(f"\nwrote {out_path} ({len(zig_source):,} bytes, {n_lines:,} lines)") 781 782 # verification: run spaCy tokenizer on test inputs and dump expected output 783 print("\ngenerating test data...") 784 import spacy 785 nlp = spacy.load("en_core_web_sm") 786 test_sentences = [ 787 "Barack Obama visited Paris.", 788 "Apple Inc. is worth $2.5 trillion.", 789 "I can't believe it's not butter!", 790 "Dr. Smith's office (room 42) is closed.", 791 "U.S.A. and U.K. are allies.", 792 "They're going to the store.", 793 'He said "hello" and left.', 794 "The cost is $500.00/month.", 795 "New York-based company", 796 "e-mail: test@example.com", 797 "10,000 people", 798 "3.14159 is pi", 799 "state-of-the-art technology", 800 "Mr. and Mrs. Jones", 801 "it's 5:30pm", 802 ] 803 804 test_data = [] 805 for sent in test_sentences: 806 doc = nlp.make_doc(sent) 807 tokens = [t.text for t in doc] 808 test_data.append({"text": sent, "tokens": tokens}) 809 810 test_path = Path("tests/tokenizer_expected.json") 811 test_path.parent.mkdir(exist_ok=True) 812 test_path.write_text(json.dumps(test_data, indent=2)) 813 print(f"wrote {test_path} ({len(test_data)} test cases)") 814 815 816if __name__ == "__main__": 817 main()