Anonymize your writing style. Zig WASM engine detects authorship markers, fine-tuned LLM rewrites to remove them. Runs entirely in-browser. fantasma.qstorage.quilibrium.com/
wasm privacy qwen zig
at main 141 lines 4.5 kB view raw
#!/usr/bin/env python3
"""Convert HuggingFace tokenizer.json to compact binary format (.tkn).

Binary format (all little-endian):
  Header (32 bytes):
    magic:            4 bytes "TOKN"
    vocab_size:       u32  max_id + 1 (total slots)
    entry_count:      u32  populated vocab entries
    merge_count:      u32  BPE merge rules
    eos_id:           u32
    im_start_id:      u32
    im_end_id:        u32
    string_pool_size: u32

  string_pool: [string_pool_size] u8

  vocab_table: [vocab_size] x (offset:u32, length:u32)
    offset=0xFFFFFFFF means empty slot

  token_to_id_table: [entry_count] x (offset:u32, length:u32, id:u32)
    sorted by string for binary search

  merges_table: [merge_count] x (first:u32, second:u32, result:u32, rank:u32)
"""

import json
import struct
import sys
from pathlib import Path

# Sentinel offset marking an unpopulated vocab-table slot.
EMPTY = 0xFFFFFFFF


def main():
    """Read tokenizer.json named on argv, write the packed .tkn next to it.

    Usage: convert.py <tokenizer.json> [output.tkn]
    Exits with status 1 when no input path is given.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <tokenizer.json> [output.tkn]")
        sys.exit(1)

    in_path = Path(sys.argv[1])
    out_path = Path(sys.argv[2]) if len(sys.argv) > 2 else in_path.with_suffix(".tkn")

    # tokenizer.json is UTF-8; be explicit so the script also works on
    # platforms whose locale default encoding is not UTF-8 (e.g. Windows).
    with open(in_path, encoding="utf-8") as f:
        data = json.load(f)

    model = data["model"]
    vocab = model["vocab"]    # str -> int
    merges = model["merges"]  # list of [str, str] or "str str"

    # Special tokens. NOTE(review): eos and im_end share the same fallback id
    # (248046) — presumably model-specific defaults; confirm against the
    # target model's tokenizer config.
    added = {t["content"]: t["id"] for t in data.get("added_tokens", [])}
    eos_id = added.get("<|endoftext|>", 248046)
    im_start_id = added.get("<|im_start|>", 248045)
    im_end_id = added.get("<|im_end|>", 248046)

    # Merge added tokens into the base vocab (added ids win on collision).
    all_vocab = dict(vocab)
    all_vocab.update(added)

    max_id = max(all_vocab.values())
    vocab_size = max_id + 1
    entry_count = len(all_vocab)

    # String pool: every token's UTF-8 bytes concatenated, laid out in
    # sorted-text order. pool_index records each token's (offset, length).
    pool = bytearray()
    pool_index = {}  # text -> (offset, length)
    for text in sorted(all_vocab):
        encoded = text.encode("utf-8")
        pool_index[text] = (len(pool), len(encoded))
        pool.extend(encoded)

    # Vocab table: id -> (offset, length). Pre-fill every slot with the
    # EMPTY sentinel in one shot, then overwrite the populated ids.
    vocab_table = bytearray(struct.pack("<II", EMPTY, 0) * vocab_size)
    for text, tid in all_vocab.items():
        off, ln = pool_index[text]
        struct.pack_into("<II", vocab_table, tid * 8, off, ln)

    # Token-to-id table: sorted by UTF-8 byte sequence so a reader can
    # binary-search on raw bytes without decoding.
    sorted_entries = sorted(all_vocab.items(), key=lambda kv: kv[0].encode("utf-8"))
    token_to_id_table = bytearray(entry_count * 12)
    for i, (text, tid) in enumerate(sorted_entries):
        off, ln = pool_index[text]
        struct.pack_into("<III", token_to_id_table, i * 12, off, ln, tid)

    # Merges table: resolve each (first, second) pair and its concatenation
    # to ids; rules referencing unknown tokens are silently skipped.
    merge_entries = []
    for rank, m in enumerate(merges):
        if isinstance(m, list):
            first_s, second_s = m[0], m[1]
        else:
            parts = m.split(" ", 1)
            if len(parts) != 2:
                continue
            first_s, second_s = parts

        first_id = all_vocab.get(first_s)
        second_id = all_vocab.get(second_s)
        result_id = all_vocab.get(first_s + second_s)
        if first_id is None or second_id is None or result_id is None:
            continue
        merge_entries.append((first_id, second_id, result_id, rank))

    merge_count = len(merge_entries)
    merges_table = bytearray(merge_count * 16)
    for i, (f_id, s_id, r_id, rk) in enumerate(merge_entries):
        struct.pack_into("<IIII", merges_table, i * 16, f_id, s_id, r_id, rk)

    # 32-byte header followed by the four sections, in fixed order.
    header = struct.pack(
        "<4sIIIIIII",
        b"TOKN",
        vocab_size,
        entry_count,
        merge_count,
        eos_id,
        im_start_id,
        im_end_id,
        len(pool),
    )

    with open(out_path, "wb") as f:
        f.write(header)
        f.write(pool)
        f.write(vocab_table)
        f.write(token_to_id_table)
        f.write(merges_table)

    total = len(header) + len(pool) + len(vocab_table) + len(token_to_id_table) + len(merges_table)
    print(f"Wrote {out_path} ({total / 1048576:.1f} MB)")
    print(f"  vocab_size={vocab_size} entries={entry_count} merges={merge_count}")
    print(f"  pool={len(pool)} bytes vocab_table={len(vocab_table)} bytes")


if __name__ == "__main__":
    main()