Anonymize your writing style: a Zig WASM engine detects authorship markers, and a fine-tuned LLM rewrites the text to remove them. Runs entirely in-browser.
fantasma.qstorage.quilibrium.com/
wasm
privacy
qwen
zig
1#!/usr/bin/env python3
2"""Convert HuggingFace tokenizer.json to compact binary format (.tkn).
3
4Binary format (all little-endian):
5 Header (32 bytes):
6 magic: 4 bytes "TOKN"
7 vocab_size: u32 max_id + 1 (total slots)
8 entry_count: u32 populated vocab entries
9 merge_count: u32 BPE merge rules
10 eos_id: u32
11 im_start_id: u32
12 im_end_id: u32
13 string_pool_size: u32
14
15 string_pool: [string_pool_size] u8
16
17 vocab_table: [vocab_size] × (offset:u32, length:u32)
18 offset=0xFFFFFFFF means empty slot
19
20 token_to_id_table: [entry_count] × (offset:u32, length:u32, id:u32)
21 sorted by string for binary search
22
23 merges_table: [merge_count] × (first:u32, second:u32, result:u32, rank:u32)
24"""
25
26import json
27import struct
28import sys
29from pathlib import Path
30
31EMPTY = 0xFFFFFFFF
32
33
def main():
    """Convert a HuggingFace tokenizer.json into the compact .tkn binary format.

    Usage: convert.py <tokenizer.json> [output.tkn]

    Reads the base vocab, BPE merges, and added (special) tokens, then writes
    the little-endian layout described in the module docstring: header,
    string pool, vocab table, token-to-id table, merges table.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <tokenizer.json> [output.tkn]")
        sys.exit(1)

    in_path = Path(sys.argv[1])
    out_path = Path(sys.argv[2]) if len(sys.argv) > 2 else in_path.with_suffix(".tkn")

    # tokenizer.json is UTF-8; be explicit so the platform's default locale
    # encoding can't corrupt multi-byte token strings.
    with open(in_path, encoding="utf-8") as f:
        data = json.load(f)

    model = data["model"]
    vocab = model["vocab"]    # str -> int
    merges = model["merges"]  # list of [str, str] pairs or "first second" strings

    # Special-token ids, with numeric fallbacks used only when the expected
    # added_tokens entries are absent.
    # NOTE(review): eos and im_end share the same fallback id (248046) —
    # confirm that is intentional for the target model.
    added = {t["content"]: t["id"] for t in data.get("added_tokens", [])}
    eos_id = added.get("<|endoftext|>", 248046)
    im_start_id = added.get("<|im_start|>", 248045)
    im_end_id = added.get("<|im_end|>", 248046)

    # Added tokens extend (and may override) the base vocab.
    all_vocab = dict(vocab)
    all_vocab.update(added)

    vocab_size = max(all_vocab.values()) + 1  # total id slots, including gaps
    entry_count = len(all_vocab)

    pool, pool_index = _build_string_pool(all_vocab)
    vocab_table = _build_vocab_table(all_vocab, pool_index, vocab_size)
    token_to_id_table = _build_token_to_id_table(all_vocab, pool_index)
    merges_table, merge_count = _build_merges_table(merges, all_vocab)

    # 32-byte header: magic + seven u32 fields (see module docstring).
    header = struct.pack(
        "<4sIIIIIII",
        b"TOKN",
        vocab_size,
        entry_count,
        merge_count,
        eos_id,
        im_start_id,
        im_end_id,
        len(pool),
    )

    with open(out_path, "wb") as f:
        f.write(header)
        f.write(pool)
        f.write(vocab_table)
        f.write(token_to_id_table)
        f.write(merges_table)

    total = len(header) + len(pool) + len(vocab_table) + len(token_to_id_table) + len(merges_table)
    print(f"Wrote {out_path} ({total / 1048576:.1f} MB)")
    print(f"  vocab_size={vocab_size} entries={entry_count} merges={merge_count}")
    print(f"  pool={len(pool)} bytes vocab_table={len(vocab_table)} bytes")


def _build_string_pool(all_vocab):
    """Concatenate every token's UTF-8 bytes into one pool.

    Returns (pool, pool_index) where pool_index maps text -> (offset, length).
    Tokens are laid out in sorted order so the pool bytes are deterministic.
    """
    pool = bytearray()
    pool_index = {}
    for text in sorted(all_vocab):
        encoded = text.encode("utf-8")
        pool_index[text] = (len(pool), len(encoded))
        pool.extend(encoded)
    return pool, pool_index


def _build_vocab_table(all_vocab, pool_index, vocab_size):
    """Build the id -> (offset:u32, length:u32) table.

    Id slots with no token keep the sentinel offset 0xFFFFFFFF (empty).
    """
    empty = 0xFFFFFFFF  # sentinel offset for unpopulated id slots
    table = bytearray(vocab_size * 8)
    for i in range(vocab_size):
        struct.pack_into("<II", table, i * 8, empty, 0)
    for text, tid in all_vocab.items():
        off, ln = pool_index[text]
        struct.pack_into("<II", table, tid * 8, off, ln)
    return table


def _build_token_to_id_table(all_vocab, pool_index):
    """Build (offset:u32, length:u32, id:u32) records sorted by UTF-8 bytes.

    Sorting by encoded bytes (not str order) matches the byte-wise binary
    search performed by the consumer.
    """
    entries = sorted(all_vocab.items(), key=lambda kv: kv[0].encode("utf-8"))
    table = bytearray(len(entries) * 12)
    for i, (text, tid) in enumerate(entries):
        off, ln = pool_index[text]
        struct.pack_into("<III", table, i * 12, off, ln, tid)
    return table


def _build_merges_table(merges, text_to_id):
    """Resolve BPE merge rules to token ids and pack them.

    Rules whose parts or concatenation are not in the vocab are skipped;
    rank preserves each rule's position in the original list regardless.
    Returns (packed table, number of resolved merges).
    """
    entries = []
    for rank, m in enumerate(merges):
        if isinstance(m, list):
            first_s, second_s = m[0], m[1]
        else:
            parts = m.split(" ", 1)
            if len(parts) != 2:
                continue
            first_s, second_s = parts

        first_id = text_to_id.get(first_s)
        second_id = text_to_id.get(second_s)
        result_id = text_to_id.get(first_s + second_s)
        if first_id is None or second_id is None or result_id is None:
            continue
        entries.append((first_id, second_id, result_id, rank))

    table = bytearray(len(entries) * 16)
    for i, (f, s, r, rk) in enumerate(entries):
        struct.pack_into("<IIII", table, i * 16, f, s, r, rk)
    return table, len(entries)
138
139
# Script entry point: run the converter only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()