Anonymize your writing style. Zig WASM engine detects authorship markers, fine-tuned LLM rewrites to remove them. Runs entirely in-browser. fantasma.qstorage.quilibrium.com/
wasm privacy qwen zig
at main 141 lines 4.5 kB view raw
#!/usr/bin/env python3
"""Convert HuggingFace tokenizer.json to compact binary format (.tkn).

Binary format (all little-endian):
  Header (32 bytes):
    magic:            4 bytes "TOKN"
    vocab_size:       u32  max_id + 1 (total slots)
    entry_count:      u32  populated vocab entries
    merge_count:      u32  BPE merge rules
    eos_id:           u32
    im_start_id:      u32
    im_end_id:        u32
    string_pool_size: u32

  string_pool: [string_pool_size] u8

  vocab_table: [vocab_size] x (offset:u32, length:u32)
    offset=0xFFFFFFFF means empty slot

  token_to_id_table: [entry_count] x (offset:u32, length:u32, id:u32)
    sorted by string for binary search

  merges_table: [merge_count] x (first:u32, second:u32, result:u32, rank:u32)
"""

import json
import struct
import sys
from pathlib import Path

# Sentinel offset marking an unpopulated vocab-table slot.
EMPTY = 0xFFFFFFFF


def main():
    """Read tokenizer.json named on argv, write the packed .tkn next to it.

    Usage: convert.py <tokenizer.json> [output.tkn]
    Exits with status 1 when no input path is given.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <tokenizer.json> [output.tkn]")
        sys.exit(1)

    in_path = Path(sys.argv[1])
    out_path = Path(sys.argv[2]) if len(sys.argv) > 2 else in_path.with_suffix(".tkn")

    # tokenizer.json is UTF-8; be explicit so the script also works on
    # platforms whose locale default encoding is not UTF-8 (e.g. Windows).
    with open(in_path, encoding="utf-8") as f:
        data = json.load(f)

    model = data["model"]
    vocab = model["vocab"]    # str -> int
    merges = model["merges"]  # list of [str, str] or "str str"

    # Special tokens. NOTE(review): eos and im_end share the same fallback id
    # (248046) — presumably model-specific defaults; confirm against the
    # target model's tokenizer config.
    added = {t["content"]: t["id"] for t in data.get("added_tokens", [])}
    eos_id = added.get("<|endoftext|>", 248046)
    im_start_id = added.get("<|im_start|>", 248045)
    im_end_id = added.get("<|im_end|>", 248046)

    # Merge added tokens into the base vocab (added ids win on collision).
    all_vocab = dict(vocab)
    all_vocab.update(added)

    max_id = max(all_vocab.values())
    vocab_size = max_id + 1
    entry_count = len(all_vocab)

    # String pool: every token's UTF-8 bytes concatenated, laid out in
    # sorted-text order. pool_index records each token's (offset, length).
    pool = bytearray()
    pool_index = {}  # text -> (offset, length)
    for text in sorted(all_vocab):
        encoded = text.encode("utf-8")
        pool_index[text] = (len(pool), len(encoded))
        pool.extend(encoded)

    # Vocab table: id -> (offset, length). Pre-fill every slot with the
    # EMPTY sentinel in one shot, then overwrite the populated ids.
    vocab_table = bytearray(struct.pack("<II", EMPTY, 0) * vocab_size)
    for text, tid in all_vocab.items():
        off, ln = pool_index[text]
        struct.pack_into("<II", vocab_table, tid * 8, off, ln)

    # Token-to-id table: sorted by UTF-8 byte sequence so a reader can
    # binary-search on raw bytes without decoding.
    sorted_entries = sorted(all_vocab.items(), key=lambda kv: kv[0].encode("utf-8"))
    token_to_id_table = bytearray(entry_count * 12)
    for i, (text, tid) in enumerate(sorted_entries):
        off, ln = pool_index[text]
        struct.pack_into("<III", token_to_id_table, i * 12, off, ln, tid)

    # Merges table: resolve each (first, second) pair and its concatenation
    # to ids; rules referencing unknown tokens are silently skipped.
    merge_entries = []
    for rank, m in enumerate(merges):
        if isinstance(m, list):
            first_s, second_s = m[0], m[1]
        else:
            parts = m.split(" ", 1)
            if len(parts) != 2:
                continue
            first_s, second_s = parts

        first_id = all_vocab.get(first_s)
        second_id = all_vocab.get(second_s)
        result_id = all_vocab.get(first_s + second_s)
        if first_id is None or second_id is None or result_id is None:
            continue
        merge_entries.append((first_id, second_id, result_id, rank))

    merge_count = len(merge_entries)
    merges_table = bytearray(merge_count * 16)
    for i, (f_id, s_id, r_id, rk) in enumerate(merge_entries):
        struct.pack_into("<IIII", merges_table, i * 16, f_id, s_id, r_id, rk)

    # 32-byte header followed by the four sections, in fixed order.
    header = struct.pack(
        "<4sIIIIIII",
        b"TOKN",
        vocab_size,
        entry_count,
        merge_count,
        eos_id,
        im_start_id,
        im_end_id,
        len(pool),
    )

    with open(out_path, "wb") as f:
        f.write(header)
        f.write(pool)
        f.write(vocab_table)
        f.write(token_to_id_table)
        f.write(merges_table)

    total = len(header) + len(pool) + len(vocab_table) + len(token_to_id_table) + len(merges_table)
    print(f"Wrote {out_path} ({total / 1048576:.1f} MB)")
    print(f"  vocab_size={vocab_size} entries={entry_count} merges={merge_count}")
    print(f"  pool={len(pool)} bytes vocab_table={len(vocab_table)} bytes")


if __name__ == "__main__":
    main()