{
  lib,
  linkFarm,
  fetchurl,
  buildPythonPackage,
  fetchFromGitHub,

  # nativeBuildInputs
  cargo,
  pkg-config,
  rustPlatform,
  rustc,
  setuptools-rust,

  # buildInputs
  openssl,

  # dependencies
  huggingface-hub,

  # tests
  datasets,
  numpy,
  pytestCheckHook,
  requests,
  tiktoken,
  writableTmpDirAsHomeHook,
}:

let
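  # linkFarm assembles a directory of symlinks to the files fetched below, under the names the test suite looks up.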
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
  # about URLs and file names
  test-data = linkFarm "tokenizers-test-data" {
    "roberta-base-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
      hash = "sha256-nn9jwtFdZmtS4h0lDS5RO4fJtxPPpph6gu2J5eblBlU=";
    };
    "roberta-base-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
      hash = "sha256-HOFmR3PFDz4MyIQmGak+3EYkUltyixiKngvjO3cmrcU=";
    };
    "albert-base-v1-tokenizer.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
      hash = "sha256-biqj1cpMaEG8NqUCgXnLTWPBKZMfoY/OOP2zjOxNKsM=";
    };
    "bert-base-uncased-vocab.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
      hash = "sha256-B+ztN1zsFE0nyQAkHz4zlHjeyVj5L928VR8pXJkgOKM=";
    };
    "tokenizer-llama3.json" = fetchurl {
      url = "https://huggingface.co/Narsil/llama-tokenizer/resolve/main/tokenizer.json";
      hash = "sha256-eePlImNfMXEwCRO7QhRkqH3mIiGCoFcLmyzLoqlksrQ=";
    };
    "big.txt" = fetchurl {
      url = "https://norvig.com/big.txt";
      hash = "sha256-+gZsfUDw8gGsQUTmUqpiQw5YprOAXscGUPZ42lgE6Hs=";
    };
    "bert-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
      hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
    };
    "tokenizer-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
      hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
    };
    "openai-gpt-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
      hash = "sha256-/fSbGefeI2hSCR2gm4Sno81eew55kWN2z0X2uBJ7gHg=";
    };
    "openai-gpt-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
      hash = "sha256-Dqm1GuaVBzzYceA1j3AWMR1nGn/zlj42fVI2Ui8pRyU=";
    };
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.21.4";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    tag = "v${version}";
    hash = "sha256-HJUycrNDpy2FOYi6aZ76orLewZCuLC1MoJ57peYJqvI=";
  };

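  # Vendor the Rust crate dependencies (resolved from the Cargo.lock in sourceRoot) so the cargo build needs no network access.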
  cargoDeps = rustPlatform.fetchCargoVendor {
    inherit
      pname
      version
      src
      sourceRoot
      ;
    hash = "sha256-0olujhOOO/BAH4JvnmXd1kE7T/sp5Vr3Z3P2X2jhZKs=";
  };

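  # The Python bindings (and their Cargo.lock) live in a subdirectory of the upstream repository.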
  sourceRoot = "${src.name}/bindings/python";

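  # cargoSetupHook points cargo at the vendored crates; maturinBuildHook builds the wheel with maturin.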
  nativeBuildInputs = [
    cargo
    pkg-config
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    rustc
    setuptools-rust
  ];

  buildInputs = [
    openssl
  ];

  dependencies = [
    huggingface-hub
  ];

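  # The tests need a writable $HOME; writableTmpDirAsHomeHook points HOME at a temporary directory.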
  nativeCheckInputs = [
    datasets
    numpy
    pytestCheckHook
    requests
    tiktoken
    writableTmpDirAsHomeHook
  ];

  postUnpack =
    # Provide data files for the tests; otherwise they attempt network access
    ''
      mkdir $sourceRoot/tests/data
      ln -s ${test-data}/* $sourceRoot/tests/data/
    '';

  pythonImportsCheck = [ "tokenizers" ];

  disabledTests = [
    # Downloads data using the datasets module
    "test_encode_special_tokens"
    "test_splitting"
    "TestTrainFromIterators"

    # Those tests require more data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  disabledTestPaths = [
    # fixture 'model' not found
    "benches/test_tiktoken.py"
  ];

  meta = {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    changelog = "https://github.com/huggingface/tokenizers/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ GaetanLepage ];
    platforms = lib.platforms.unix;
  };
}