{ lib
, stdenv
, buildPythonPackage
, cargo
, datasets
, fetchFromGitHub
, fetchurl
, libiconv
, numpy
, openssl
, pkg-config
, pytestCheckHook
, pythonOlder
, requests
, rustPlatform
, rustc
, Security
, setuptools-rust
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py
  # for details about URLs and file names
  robertaVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
    sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
  };
  robertaMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
    sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
  };
  albertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
    sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
  };
  bertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
    sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
  };
  norvigBig = fetchurl {
    url = "https://norvig.com/big.txt";
    sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
  };
  docPipelineTokenizer = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
    hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
  };
  docQuicktourTokenizer = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
    hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
  };
  openaiVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
    sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
  };
  openaiMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
    sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.13.3";

  disabled = pythonOlder "3.7";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = pname;
    rev = "python-v${version}";
    hash = "sha256-QZG5jmr3vbyQs4mVBjwVDR31O66dUM+p39R0htJ1umk=";
  };
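
  # Upstream does not ship a usable Cargo.lock for the Python bindings, so a
  # copy is vendored next to this expression and linked into the source tree;
  # the importCargoLock call below reads the same file. After a version bump
  # it can be regenerated with `cargo generate-lockfile` in bindings/python.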
  postPatch = ''
    ln -s ${./Cargo.lock} Cargo.lock
  '';

  cargoDeps = rustPlatform.importCargoLock {
    lockFile = ./Cargo.lock;
  };
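
  # The Python package lives in a subdirectory of the repository.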
  sourceRoot = "source/bindings/python";
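
  # cargoSetupHook wires the crates vendored by importCargoLock into the
  # cargo/rustc build that setuptools-rust drives.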
  nativeBuildInputs = [
    pkg-config
    setuptools-rust
    rustPlatform.cargoSetupHook
    cargo
    rustc
  ];
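
  # Native libraries linked by the Rust dependencies; on Darwin the build
  # additionally needs libiconv and Apple's Security framework.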
  buildInputs = [
    openssl
  ] ++ lib.optionals stdenv.isDarwin [
    libiconv
    Security
  ];

  propagatedBuildInputs = [
    numpy
  ];
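
  # Dependencies used only by the test suite; datasets is exercised by the
  # train-from-iterators tests disabled below.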
  nativeCheckInputs = [
    datasets
    pytestCheckHook
    requests
  ];

  postUnpack = ''
    # Provide the data files the tests expect; otherwise they attempt network access
    mkdir $sourceRoot/tests/data
    ( cd $sourceRoot/tests/data
      ln -s ${robertaVocab} roberta-base-vocab.json
      ln -s ${robertaMerges} roberta-base-merges.txt
      ln -s ${albertVocab} albert-base-v1-tokenizer.json
      ln -s ${bertVocab} bert-base-uncased-vocab.txt
      ln -s ${docPipelineTokenizer} bert-wiki.json
      ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
      ln -s ${norvigBig} big.txt
      ln -s ${openaiVocab} openai-gpt-vocab.json
      ln -s ${openaiMerges} openai-gpt-merges.txt )
  '';
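
  # Some tests write cached data under $HOME, so point it at a writable
  # scratch directory.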
  preCheck = ''
    export HOME=$(mktemp -d)
  '';

  pythonImportsCheck = [
    "tokenizers"
  ];

  disabledTests = [
    # Downloads data using the datasets module
    "TestTrainFromIterators"
    # These tests require additional data files
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch" # spelling matches upstream
  ];

  meta = with lib; {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    license = licenses.asl20;
    maintainers = with maintainers; [ ];
    platforms = platforms.unix;
  };
}