{
  lib,
  linkFarm,
  fetchurl,
  buildPythonPackage,
  fetchFromGitHub,

  # nativeBuildInputs
  cargo,
  pkg-config,
  rustPlatform,
  rustc,
  setuptools-rust,

  # buildInputs
  openssl,

  # dependencies
  huggingface-hub,

  # tests
  datasets,
  numpy,
  pytestCheckHook,
  requests,
  tiktoken,
  writableTmpDirAsHomeHook,
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py
  # for details about URLs and file names
  test-data = linkFarm "tokenizers-test-data" {
    "roberta-base-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
      hash = "sha256-nn9jwtFdZmtS4h0lDS5RO4fJtxPPpph6gu2J5eblBlU=";
    };
    "roberta-base-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
      hash = "sha256-HOFmR3PFDz4MyIQmGak+3EYkUltyixiKngvjO3cmrcU=";
    };
    "albert-base-v1-tokenizer.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
      hash = "sha256-biqj1cpMaEG8NqUCgXnLTWPBKZMfoY/OOP2zjOxNKsM=";
    };
    "bert-base-uncased-vocab.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
      hash = "sha256-B+ztN1zsFE0nyQAkHz4zlHjeyVj5L928VR8pXJkgOKM=";
    };
    "tokenizer-llama3.json" = fetchurl {
      url = "https://huggingface.co/Narsil/llama-tokenizer/resolve/main/tokenizer.json";
      hash = "sha256-eePlImNfMXEwCRO7QhRkqH3mIiGCoFcLmyzLoqlksrQ=";
    };
    "big.txt" = fetchurl {
      url = "https://norvig.com/big.txt";
      hash = "sha256-+gZsfUDw8gGsQUTmUqpiQw5YprOAXscGUPZ42lgE6Hs=";
    };
    "bert-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
      hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
    };
    "tokenizer-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
      hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
    };
    "openai-gpt-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
      hash = "sha256-/fSbGefeI2hSCR2gm4Sno81eew55kWN2z0X2uBJ7gHg=";
    };
    "openai-gpt-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
      hash = "sha256-Dqm1GuaVBzzYceA1j3AWMR1nGn/zlj42fVI2Ui8pRyU=";
    };
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.21.3";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    tag = "v${version}";
    hash = "sha256-8z1jgH0Nj7D+joN42AA2ORNSLvcfWiYHn4dpTq1HWB0=";
  };

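  # The Rust crate dependencies are vendored from the Cargo.lock kept next to this
  # file; it is symlinked into the source so cargo (via cargoSetupHook) picks it up.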
  postPatch = ''
    ln -s ${./Cargo.lock} Cargo.lock
  '';

  cargoDeps = rustPlatform.importCargoLock {
    lockFile = ./Cargo.lock;
  };

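  # The Python bindings live in a subdirectory of the upstream Rust workspace.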
  sourceRoot = "${src.name}/bindings/python";

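  # Built as a maturin project: cargoSetupHook sets up the vendored crates from
  # cargoDeps and maturinBuildHook performs the actual wheel build.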
  nativeBuildInputs = [
    cargo
    pkg-config
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    rustc
    setuptools-rust
  ];

  buildInputs = [
    openssl
  ];

  dependencies = [
    huggingface-hub
  ];

  nativeCheckInputs = [
    datasets
    numpy
    pytestCheckHook
    requests
    tiktoken
    writableTmpDirAsHomeHook
  ];

  postUnpack =
    # Provide the data files for the tests, otherwise they attempt network access
    ''
      mkdir $sourceRoot/tests/data
      ln -s ${test-data}/* $sourceRoot/tests/data/
    '';

  pythonImportsCheck = [ "tokenizers" ];

  disabledTests = [
    # Downloads data using the datasets module
    "test_encode_special_tokens"
    "test_splitting"
    "TestTrainFromIterators"

    # These tests require additional test data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  disabledTestPaths = [
    # fixture 'model' not found
    "benches/test_tiktoken.py"
  ];

  meta = {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    changelog = "https://github.com/huggingface/tokenizers/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ GaetanLepage ];
    platforms = lib.platforms.unix;
  };
}