#!/usr/bin/env python3
"""Convert HuggingFace tokenizer.json to compact binary format (.tkn).

Binary format (all little-endian):

Header (32 bytes):
    magic:            4 bytes "TOKN"
    vocab_size:       u32  max_id + 1 (total slots)
    entry_count:      u32  populated vocab entries
    merge_count:      u32  BPE merge rules
    eos_id:           u32
    im_start_id:      u32
    im_end_id:        u32
    string_pool_size: u32

string_pool:       [string_pool_size] u8
vocab_table:       [vocab_size] x (offset:u32, length:u32)
                   offset=0xFFFFFFFF means empty slot
token_to_id_table: [entry_count] x (offset:u32, length:u32, id:u32)
                   sorted by string for binary search
merges_table:      [merge_count] x (first:u32, second:u32, result:u32, rank:u32)
"""

import json
import struct
import sys
from pathlib import Path

# Sentinel offset marking an unused vocab-table slot (see format spec above).
EMPTY = 0xFFFFFFFF


def main():
    """Read tokenizer.json (argv[1]) and write the packed .tkn file (argv[2] or
    argv[1] with a .tkn suffix). Exits with status 1 on missing arguments."""
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <tokenizer.json> [output.tkn]")
        sys.exit(1)
    in_path = Path(sys.argv[1])
    out_path = Path(sys.argv[2]) if len(sys.argv) > 2 else in_path.with_suffix(".tkn")

    with open(in_path) as f:
        data = json.load(f)

    model = data["model"]
    vocab = model["vocab"]    # str -> int
    merges = model["merges"]  # list of [str, str] or "str str"

    # Special tokens. The numeric fallbacks look model-specific
    # (Qwen-style ids); NOTE(review): confirm the defaults — eos and im_end
    # sharing 248046 may or may not be intentional.
    added = {t["content"]: t["id"] for t in data.get("added_tokens", [])}
    eos_id = added.get("<|endoftext|>", 248046)
    im_start_id = added.get("<|im_start|>", 248045)
    im_end_id = added.get("<|im_end|>", 248046)

    # Merge added tokens into the base vocab; added tokens win on collision.
    all_vocab = dict(vocab)
    for content, tid in added.items():
        all_vocab[content] = tid

    max_id = max(all_vocab.values())
    vocab_size = max_id + 1          # id space may be sparse; slots cover 0..max_id
    entry_count = len(all_vocab)     # only populated entries

    # Build string pool: every token's UTF-8 bytes concatenated, indexed by
    # (offset, length). Sorted order so the token_to_id table below is
    # binary-searchable and the pool layout is deterministic.
    pool = bytearray()
    pool_index = {}  # text -> (offset, length)
    for text in sorted(all_vocab.keys()):
        offset = len(pool)
        encoded = text.encode("utf-8")
        pool.extend(encoded)
        pool_index[text] = (offset, len(encoded))

    # Vocab table: id -> (offset, length). Initialize every slot empty, then
    # fill the populated ids. (Each entry is 2 x u32 = 8 bytes, per the spec.)
    vocab_table = bytearray(vocab_size * 8)
    for i in range(vocab_size):
        struct.pack_into("<II", vocab_table, i * 8, EMPTY, 0)
    for text, tid in all_vocab.items():
        off, length = pool_index[text]
        struct.pack_into("<II", vocab_table, tid * 8, off, length)

    # Build text->id lookup for merge resolution.
    text_to_id = {text: tid for text, tid in all_vocab.items()}

    # Merges table: resolve each merge rule's pair and result to token ids.
    # Rules referencing tokens absent from the vocab are silently dropped,
    # preserving the original rank numbering for the kept ones.
    merge_entries = []
    for rank, m in enumerate(merges):
        if isinstance(m, list):
            first_s, second_s = m[0], m[1]
        else:
            # Legacy "first second" string form; split once so tokens
            # containing spaces in the second part survive.
            parts = m.split(" ", 1)
            if len(parts) != 2:
                continue
            first_s, second_s = parts
        first_id = text_to_id.get(first_s)
        second_id = text_to_id.get(second_s)
        result_id = text_to_id.get(first_s + second_s)
        if first_id is None or second_id is None or result_id is None:
            continue
        merge_entries.append((first_id, second_id, result_id, rank))

    merge_count = len(merge_entries)
    merges_table = bytearray(merge_count * 16)  # 4 x u32 per rule
    for i, (first, second, result, rank) in enumerate(merge_entries):
        struct.pack_into("<IIII", merges_table, i * 16, first, second, result, rank)

    # token_to_id table: (offset, length, id), sorted by token string to match
    # the string-pool order for binary search. NOTE(review): Python str sort
    # order differs from raw UTF-8 byte order for non-ASCII — confirm the
    # reader's comparator matches.
    t2i_table = bytearray(entry_count * 12)  # 3 x u32 per entry
    for i, text in enumerate(sorted(all_vocab.keys())):
        off, length = pool_index[text]
        struct.pack_into("<III", t2i_table, i * 12, off, length, all_vocab[text])

    # 32-byte header followed by the sections in the order documented above.
    header = struct.pack(
        "<4sIIIIIII",
        b"TOKN",
        vocab_size,
        entry_count,
        merge_count,
        eos_id,
        im_start_id,
        im_end_id,
        len(pool),
    )
    with open(out_path, "wb") as out:
        out.write(header)
        out.write(pool)
        out.write(vocab_table)
        out.write(t2i_table)
        out.write(merges_table)

    total = 32 + len(pool) + len(vocab_table) + len(t2i_table) + len(merges_table)
    print(
        f"Wrote {out_path}: {total} bytes "
        f"({entry_count} tokens, {merge_count} merges, {len(pool)} pool bytes)"
    )


if __name__ == "__main__":
    main()