{
  lib,
  linkFarm,
  fetchurl,
  buildPythonPackage,
  fetchFromGitHub,
  python,

  # nativeBuildInputs
  cargo,
  pkg-config,
  rustPlatform,
  rustc,
  setuptools-rust,

  # buildInputs
  openssl,

  # dependencies
  huggingface-hub,

  # tests
  datasets,
  numpy,
  pytestCheckHook,
  requests,
  tiktoken,
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
  # about URLs and file names
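  # linkFarm builds a single derivation whose output is a directory containing
  # one symlink per entry below; postUnpack links them into tests/data so the
  # test suite can run without network access.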
  test-data = linkFarm "tokenizers-test-data" {
    "roberta-base-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
      hash = "sha256-nn9jwtFdZmtS4h0lDS5RO4fJtxPPpph6gu2J5eblBlU=";
    };
    "roberta-base-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
      hash = "sha256-HOFmR3PFDz4MyIQmGak+3EYkUltyixiKngvjO3cmrcU=";
    };
    "albert-base-v1-tokenizer.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
      hash = "sha256-biqj1cpMaEG8NqUCgXnLTWPBKZMfoY/OOP2zjOxNKsM=";
    };
    "bert-base-uncased-vocab.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
      hash = "sha256-B+ztN1zsFE0nyQAkHz4zlHjeyVj5L928VR8pXJkgOKM=";
    };
    "big.txt" = fetchurl {
      url = "https://norvig.com/big.txt";
      hash = "sha256-+gZsfUDw8gGsQUTmUqpiQw5YprOAXscGUPZ42lgE6Hs=";
    };
    "bert-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
      hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
    };
    "tokenizer-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
      hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
    };
    "openai-gpt-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
      hash = "sha256-/fSbGefeI2hSCR2gm4Sno81eew55kWN2z0X2uBJ7gHg=";
    };
    "openai-gpt-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
      hash = "sha256-Dqm1GuaVBzzYceA1j3AWMR1nGn/zlj42fVI2Ui8pRyU=";
    };
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.21.0";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    tag = "v${version}";
    hash = "sha256-G65XiVlvJXOC9zqcVr9vWamUnpC0aa4kyYkE2v1K2iY=";
  };

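  # Fixed-output derivation with the vendored crate dependencies. sourceRoot is
  # inherited so that vendoring uses the lock file of the Python bindings
  # rather than the top-level Rust crate.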
  cargoDeps = rustPlatform.fetchCargoTarball {
    inherit
      pname
      version
      src
      sourceRoot
      ;
    hash = "sha256-5cw63ydyhpMup2tOe/hpG2W6YZ+cvT75MJBkE5Wap4s=";
  };

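  # The Python bindings live in a subdirectory of the upstream repository.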
  sourceRoot = "${src.name}/bindings/python";
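  # Build the extension against the interpreter of this package set instead of
  # whichever Python maturin would auto-detect.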
  maturinBuildFlags = [ "--interpreter ${python.executable}" ];

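  # cargoSetupHook makes the vendored crates from cargoDeps available to cargo;
  # maturinBuildHook then builds the wheel with maturin.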
  nativeBuildInputs = [
    cargo
    pkg-config
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    rustc
    setuptools-rust
  ];

  buildInputs = [
    openssl
  ];

  dependencies = [
    huggingface-hub
  ];

  nativeCheckInputs = [
    datasets
    numpy
    pytestCheckHook
    requests
    tiktoken
  ];

  postUnpack = ''
    # Provide data files for the tests; otherwise they attempt network access
    mkdir $sourceRoot/tests/data
    ln -s ${test-data}/* $sourceRoot/tests/data/
  '';

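  # The test suite writes cache files under $HOME, which does not exist in the
  # build sandbox.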
  preCheck = ''
    export HOME=$(mktemp -d)
  '';

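  # Smoke test: make sure the installed module can be imported.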
  pythonImportsCheck = [ "tokenizers" ];

  disabledTests = [
    # Downloads data using the datasets module
    "test_encode_special_tokens"
    "test_splitting"
    "TestTrainFromIterators"

    # These tests require additional data that is not provided above
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  disabledTestPaths = [
    # fixture 'model' not found
    "benches/test_tiktoken.py"
  ];

  meta = {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    changelog = "https://github.com/huggingface/tokenizers/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ GaetanLepage ];
    platforms = lib.platforms.unix;
  };
}