1{
2 lib,
3 linkFarm,
4 fetchurl,
5 buildPythonPackage,
6 fetchFromGitHub,
7
8 # nativeBuildInputs
9 cargo,
10 pkg-config,
11 rustPlatform,
12 rustc,
13 setuptools-rust,
14
15 # buildInputs
16 openssl,
17
18 # dependencies
19 huggingface-hub,
20
21 # tests
22 datasets,
23 numpy,
24 pytestCheckHook,
25 requests,
26 tiktoken,
27 writableTmpDirAsHomeHook,
28}:
29
let
  # Data files for the test suite, gathered into a single directory of links.
  # The link names and download URLs follow
  # https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py
  #
  # Each entry below maps the link name inside the farm to the arguments
  # handed to `fetchurl` for that file.
  test-data = linkFarm "tokenizers-test-data" (
    lib.mapAttrs (_name: fetchurl) {
      "roberta-base-vocab.json" = {
        url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
        hash = "sha256-nn9jwtFdZmtS4h0lDS5RO4fJtxPPpph6gu2J5eblBlU=";
      };
      "roberta-base-merges.txt" = {
        url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
        hash = "sha256-HOFmR3PFDz4MyIQmGak+3EYkUltyixiKngvjO3cmrcU=";
      };
      "albert-base-v1-tokenizer.json" = {
        url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
        hash = "sha256-biqj1cpMaEG8NqUCgXnLTWPBKZMfoY/OOP2zjOxNKsM=";
      };
      "bert-base-uncased-vocab.txt" = {
        url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
        hash = "sha256-B+ztN1zsFE0nyQAkHz4zlHjeyVj5L928VR8pXJkgOKM=";
      };
      "big.txt" = {
        url = "https://norvig.com/big.txt";
        hash = "sha256-+gZsfUDw8gGsQUTmUqpiQw5YprOAXscGUPZ42lgE6Hs=";
      };
      "bert-wiki.json" = {
        url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
        hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
      };
      "tokenizer-wiki.json" = {
        url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
        hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
      };
      "openai-gpt-vocab.json" = {
        url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
        hash = "sha256-/fSbGefeI2hSCR2gm4Sno81eew55kWN2z0X2uBJ7gHg=";
      };
      "openai-gpt-merges.txt" = {
        url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
        hash = "sha256-Dqm1GuaVBzzYceA1j3AWMR1nGn/zlj42fVI2Ui8pRyU=";
      };
    }
  );
in
# Python bindings for huggingface/tokenizers, built from the
# `bindings/python` subdirectory of the upstream repository.
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.21.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    tag = "v${version}";
    hash = "sha256-3S7ZCaZnnwyNjoZ4Y/q3ngQE2MIm2iyCCjYAkdMVG2A=";
  };

  # Only the Python bindings are built; every later phase runs from here.
  sourceRoot = "${src.name}/bindings/python";

  # Vendored crates for the bindings. `sourceRoot` is inherited so that the
  # vendoring is performed against the same subdirectory that is built.
  cargoDeps = rustPlatform.fetchCargoVendor {
    inherit
      pname
      version
      src
      sourceRoot
      ;
    hash = "sha256-I7LlBmeVY2rWI0ta6x311iAurQKuutsClrbUgkt9xWk=";
  };

  # Add data files for tests, otherwise tests attempt network access.
  # The expansions are quoted and `-p` is used so the hook does not break on
  # unusual paths or when the directory already exists.
  postUnpack = ''
    mkdir -p "$sourceRoot/tests/data"
    ln -s ${test-data}/* "$sourceRoot/tests/data/"
  '';

  # TestUnigram.test_continuing_prefix_trainer_mismatch fails with:
  # Exception: No such file or directory (os error 2)
  # Fix submitted upstream: https://github.com/huggingface/tokenizers/pull/1747
  postPatch = ''
    substituteInPlace tests/bindings/test_trainers.py \
      --replace-fail '"data/' '"tests/data/'
  '';

  nativeBuildInputs = [
    cargo
    pkg-config
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    rustc
    setuptools-rust
  ];

  buildInputs = [
    openssl
  ];

  dependencies = [
    huggingface-hub
  ];

  nativeCheckInputs = [
    datasets
    numpy
    pytestCheckHook
    requests
    tiktoken
    writableTmpDirAsHomeHook
  ];

  pythonImportsCheck = [ "tokenizers" ];

  disabledTests = [
    # Downloads data using the datasets module
    "test_encode_special_tokens"
    "test_splitting"
    "TestTrainFromIterators"

    # Those tests require more data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    # NOTE(review): this is spelled `mistmatch`, while the postPatch comment
    # above names `test_continuing_prefix_trainer_mismatch`. If the latter is
    # the upstream test name, this entry matches nothing and can be dropped.
    # Confirm against upstream before changing it.
    "test_continuing_prefix_trainer_mistmatch"
  ];

  disabledTestPaths = [
    # fixture 'model' not found
    "benches/test_tiktoken.py"
  ];

  meta = {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    changelog = "https://github.com/huggingface/tokenizers/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ GaetanLepage ];
    platforms = lib.platforms.unix;
  };
}