#!/usr/bin/env python3
"""Convert HuggingFace tokenizer.json to compact binary format (.tkn).

Binary format (all little-endian):

Header (32 bytes):
    magic:            4 bytes "TOKN"
    vocab_size:       u32  max_id + 1 (total slots)
    entry_count:      u32  populated vocab entries
    merge_count:      u32  BPE merge rules
    eos_id:           u32
    im_start_id:      u32
    im_end_id:        u32
    string_pool_size: u32

string_pool:       [string_pool_size] u8
vocab_table:       [vocab_size] x (offset:u32, length:u32)
                   offset=0xFFFFFFFF means empty slot
token_to_id_table: [entry_count] x (offset:u32, length:u32, id:u32)
                   sorted by string for binary search
merges_table:      [merge_count] x (first:u32, second:u32, result:u32, rank:u32)
"""

import json
import struct
import sys
from pathlib import Path

# Sentinel offset marking an unused vocab-table slot (see format spec above).
EMPTY = 0xFFFFFFFF


def main():
    """Read tokenizer.json (argv[1]) and write the packed .tkn file (argv[2] or
    argv[1] with a .tkn suffix). Exits with status 1 on missing arguments."""
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <tokenizer.json> [output.tkn]")
        sys.exit(1)
    in_path = Path(sys.argv[1])
    out_path = Path(sys.argv[2]) if len(sys.argv) > 2 else in_path.with_suffix(".tkn")

    with open(in_path) as f:
        data = json.load(f)

    model = data["model"]
    vocab = model["vocab"]    # str -> int
    merges = model["merges"]  # list of [str, str] or "str str"

    # Special tokens. The numeric fallbacks look model-specific
    # (Qwen-style ids); NOTE(review): confirm the defaults — eos and im_end
    # sharing 248046 may or may not be intentional.
    added = {t["content"]: t["id"] for t in data.get("added_tokens", [])}
    eos_id = added.get("<|endoftext|>", 248046)
    im_start_id = added.get("<|im_start|>", 248045)
    im_end_id = added.get("<|im_end|>", 248046)

    # Merge added tokens into the base vocab; added tokens win on collision.
    all_vocab = dict(vocab)
    for content, tid in added.items():
        all_vocab[content] = tid

    max_id = max(all_vocab.values())
    vocab_size = max_id + 1          # id space may be sparse; slots cover 0..max_id
    entry_count = len(all_vocab)     # only populated entries

    # Build string pool: every token's UTF-8 bytes concatenated, indexed by
    # (offset, length). Sorted order so the token_to_id table below is
    # binary-searchable and the pool layout is deterministic.
    pool = bytearray()
    pool_index = {}  # text -> (offset, length)
    for text in sorted(all_vocab.keys()):
        offset = len(pool)
        encoded = text.encode("utf-8")
        pool.extend(encoded)
        pool_index[text] = (offset, len(encoded))

    # Vocab table: id -> (offset, length). Initialize every slot empty, then
    # fill the populated ids. (Each entry is 2 x u32 = 8 bytes, per the spec.)
    vocab_table = bytearray(vocab_size * 8)
    for i in range(vocab_size):
        struct.pack_into("<II", vocab_table, i * 8, EMPTY, 0)
    for text, tid in all_vocab.items():
        off, length = pool_index[text]
        struct.pack_into("<II", vocab_table, tid * 8, off, length)

    # Build text->id lookup for merge resolution.
    text_to_id = {text: tid for text, tid in all_vocab.items()}

    # Merges table: resolve each merge rule's pair and result to token ids.
    # Rules referencing tokens absent from the vocab are silently dropped,
    # preserving the original rank numbering for the kept ones.
    merge_entries = []
    for rank, m in enumerate(merges):
        if isinstance(m, list):
            first_s, second_s = m[0], m[1]
        else:
            # Legacy "first second" string form; split once so tokens
            # containing spaces in the second part survive.
            parts = m.split(" ", 1)
            if len(parts) != 2:
                continue
            first_s, second_s = parts
        first_id = text_to_id.get(first_s)
        second_id = text_to_id.get(second_s)
        result_id = text_to_id.get(first_s + second_s)
        if first_id is None or second_id is None or result_id is None:
            continue
        merge_entries.append((first_id, second_id, result_id, rank))

    merge_count = len(merge_entries)
    merges_table = bytearray(merge_count * 16)  # 4 x u32 per rule
    for i, (first, second, result, rank) in enumerate(merge_entries):
        struct.pack_into("<IIII", merges_table, i * 16, first, second, result, rank)

    # token_to_id table: (offset, length, id), sorted by token string to match
    # the string-pool order for binary search. NOTE(review): Python str sort
    # order differs from raw UTF-8 byte order for non-ASCII — confirm the
    # reader's comparator matches.
    t2i_table = bytearray(entry_count * 12)  # 3 x u32 per entry
    for i, text in enumerate(sorted(all_vocab.keys())):
        off, length = pool_index[text]
        struct.pack_into("<III", t2i_table, i * 12, off, length, all_vocab[text])

    # 32-byte header followed by the sections in the order documented above.
    header = struct.pack(
        "<4sIIIIIII",
        b"TOKN",
        vocab_size,
        entry_count,
        merge_count,
        eos_id,
        im_start_id,
        im_end_id,
        len(pool),
    )
    with open(out_path, "wb") as out:
        out.write(header)
        out.write(pool)
        out.write(vocab_table)
        out.write(t2i_table)
        out.write(merges_table)

    total = 32 + len(pool) + len(vocab_table) + len(t2i_table) + len(merges_table)
    print(
        f"Wrote {out_path}: {total} bytes "
        f"({entry_count} tokens, {merge_count} merges, {len(pool)} pool bytes)"
    )


if __name__ == "__main__":
    main()