{ lib
, stdenv
, buildPythonPackage
, datasets
, fetchFromGitHub
, fetchurl
, libiconv
, numpy
, openssl
, pkg-config
, pytestCheckHook
, pythonOlder
, requests
, rustPlatform
, Security
, setuptools-rust
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py
  # for details about the URLs and file names
  robertaVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
    sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
  };
  robertaMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
    sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
  };
  albertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
    sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
  };
  bertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
    sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
  };
  norvigBig = fetchurl {
    url = "https://norvig.com/big.txt";
    sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
  };
  docPipelineTokenizer = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
    hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
  };
  docQuicktourTokenizer = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
    hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
  };
  openaiVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
    sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
  };
  openaiMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
    sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.12.1";

  disabled = pythonOlder "3.7";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = pname;
    rev = "python-v${version}";
    hash = "sha256-XIXKgcqa6ToAH4OkyaaJALOS9F+sD8d5Z71RttRcIsw=";
  };

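  # Vendor the Rust crate dependencies of the bindings. This fixed-output hash
  # covers the vendored tarball and must be regenerated whenever version (and
  # thus Cargo.lock) changes.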
  cargoDeps = rustPlatform.fetchCargoTarball {
    inherit src sourceRoot;
    name = "${pname}-${version}";
    sha256 = "sha256-Euvf0LNMa2Od+6gY1Ldge/7VPrH5mJoZduRRsb+lM/E=";
  };

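  # The Python bindings live in a subdirectory of the upstream repository;
  # both the build and fetchCargoTarball above use this as their root.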
  sourceRoot = "source/bindings/python";

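  # setuptools-rust drives the build of the native extension, so the Rust
  # toolchain is needed at build time.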
  nativeBuildInputs = [
    pkg-config
    setuptools-rust
  ] ++ (with rustPlatform; [
    cargoSetupHook
    rust.cargo
    rust.rustc
  ]);

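  # On Darwin the crates additionally link against libiconv and the Security
  # framework.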
  buildInputs = [
    openssl
  ] ++ lib.optionals stdenv.isDarwin [
    libiconv
    Security
  ];

  propagatedBuildInputs = [
    numpy
  ];

  checkInputs = [
    datasets
    pytestCheckHook
    requests
  ];

  postUnpack = ''
    # Add data files for tests, otherwise the tests attempt network access
    mkdir $sourceRoot/tests/data
    ( cd $sourceRoot/tests/data
      ln -s ${robertaVocab} roberta-base-vocab.json
      ln -s ${robertaMerges} roberta-base-merges.txt
      ln -s ${albertVocab} albert-base-v1-tokenizer.json
      ln -s ${bertVocab} bert-base-uncased-vocab.txt
      ln -s ${docPipelineTokenizer} bert-wiki.json
      ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
      ln -s ${norvigBig} big.txt
      ln -s ${openaiVocab} openai-gpt-vocab.json
      ln -s ${openaiMerges} openai-gpt-merges.txt )
  '';

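  # The test suite writes HuggingFace cache files, so give it a writable HOME.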
  preCheck = ''
    export HOME=$(mktemp -d)
  '';

  pythonImportsCheck = [
    "tokenizers"
  ];

  disabledTests = [
    # Downloads data using the datasets module
    "TestTrainFromIterators"
    # These tests require additional data that is not vendored above
    "test_from_pretrained"
    "test_from_pretrained_revision"
135 "test_continuing_prefix_trainer_mistmatch"
136 ];
137
138 meta = with lib; {
139 description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
140 homepage = "https://github.com/huggingface/tokenizers";
141 license = licenses.asl20;
142 maintainers = with maintainers; [ ];
143 platforms = platforms.unix;
144 };
145}