1{
2 lib,
3 stdenv,
4 linkFarm,
5 buildPythonPackage,
6 cargo,
7 datasets,
8 huggingface-hub,
9 fetchFromGitHub,
10 fetchurl,
11 libiconv,
12 numpy,
13 openssl,
14 pkg-config,
15 pytestCheckHook,
16 python,
17 pythonOlder,
18 requests,
19 rustPlatform,
20 rustc,
21 Security,
22 setuptools-rust,
23}:
24
let
  # Most fixtures are served from this prefix of the Hugging Face S3 bucket.
  s3Root = "https://s3.amazonaws.com/models.huggingface.co/bert";

  # Fetch `path` relative to `s3Root`. `pin` is the checksum attribute set
  # handed through to fetchurl unchanged, i.e. `{ sha256 = ...; }` or
  # `{ hash = ...; }`.
  fromS3 = path: pin: fetchurl ({ url = "${s3Root}/${path}"; } // pin);

  # Pre-fetched fixtures for the test suite, keyed by the file name each one
  # is linked under. URLs and file names follow
  # https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py
  test-data = linkFarm "tokenizers-test-data" {
    "roberta-base-vocab.json" = fromS3 "roberta-base-vocab.json" {
      sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
    };
    "roberta-base-merges.txt" = fromS3 "roberta-base-merges.txt" {
      sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
    };
    "albert-base-v1-tokenizer.json" = fromS3 "albert-base-v1-tokenizer.json" {
      sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
    };
    "bert-base-uncased-vocab.txt" = fromS3 "bert-base-uncased-vocab.txt" {
      sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
    };
    # The only fixture that is not hosted in the S3 bucket.
    "big.txt" = fetchurl {
      url = "https://norvig.com/big.txt";
      sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
    };
    # The next two share the remote file name `tokenizer.json`; they are told
    # apart by the link name they get in the farm.
    "bert-wiki.json" = fromS3 "anthony/doc-pipeline/tokenizer.json" {
      hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
    };
    "tokenizer-wiki.json" = fromS3 "anthony/doc-quicktour/tokenizer.json" {
      hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
    };
    "openai-gpt-vocab.json" = fromS3 "openai-gpt-vocab.json" {
      sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
    };
    "openai-gpt-merges.txt" = fromS3 "openai-gpt-merges.txt" {
      sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
    };
  };
in
# Python bindings of Hugging Face `tokenizers`: a Rust extension built with
# maturin from the `bindings/python` sub-directory of the upstream repository.
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.19.1";
  pyproject = true;

  disabled = pythonOlder "3.7";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    rev = "refs/tags/v${version}";
    hash = "sha256-sKEAt46cdme821tzz9WSKnQb3hPmFJ4zvHgBNRxjEuk=";
  };

  # Rust crates are pinned by the Cargo.lock stored next to this expression.
  cargoDeps = rustPlatform.importCargoLock { lockFile = ./Cargo.lock; };

  # Build from the Python bindings directory rather than the repository root.
  sourceRoot = "${src.name}/bindings/python";
  # Point maturin at the interpreter this package set is built for.
  maturinBuildFlags = [ "--interpreter ${python.executable}" ];

  nativeBuildInputs = [
    pkg-config
    setuptools-rust
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    cargo
    rustc
  ];

  buildInputs =
    [ openssl ]
    # Extra libraries that are only linked on Darwin.
    ++ lib.optionals stdenv.hostPlatform.isDarwin [
      libiconv
      Security
    ];

  dependencies = [
    numpy
    huggingface-hub
  ];

  nativeCheckInputs = [
    datasets
    pytestCheckHook
    requests
  ];

  # `-p` keeps the phase from failing if the directory is already present in
  # the unpacked source, and the quotes protect against word splitting of
  # $sourceRoot.
  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access
    mkdir -p "$sourceRoot/tests/data"
    ln -s ${test-data}/* "$sourceRoot/tests/data/"
  '';

  # Run the tests with a fresh temporary directory as HOME.
  preCheck = ''
    export HOME=$(mktemp -d);
  '';

  pythonImportsCheck = [ "tokenizers" ];

  # Names are matched against the upstream test suite, so they are kept
  # verbatim (including the upstream spelling "mistmatch").
  disabledTests = [
    # Downloads data using the datasets module
    "test_encode_special_tokens"
    "test_splitting"
    "TestTrainFromIterators"
    # Those tests require more data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  # Attributes are qualified explicitly instead of opening `with lib;`.
  meta = {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    license = lib.licenses.asl20;
    maintainers = [ ];
    platforms = lib.platforms.unix;
  };
}