{ lib
, fetchFromGitHub
, fetchurl
, buildPythonPackage
, rustPlatform
, setuptools-rust
, numpy
, datasets
, pytestCheckHook
, requests
}:

let
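  # Tokenizer vocabularies, merges, and other fixtures, pinned here so the
  # test suite can run without network access (see postUnpack below).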
  robertaVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
    sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
  };
  robertaMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
    sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
  };
  albertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
    sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
  };
  bertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
    sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
  };
  norvigBig = fetchurl {
    url = "https://norvig.com/big.txt";
    sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
  };
  docPipelineTokenizer = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
    hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
  };
  docQuicktourTokenizer = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
    hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
  };
  openaiVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
    sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
  };
  openaiMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
    sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
  };
in buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.10.1";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = pname;
    rev = "python-v${version}";
    hash = "sha256-N/dKjQwHKmJnB76q8ISQ3cjuW0Z4GqGavnFFx/w9JRQ=";
  };

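  # Vendored Rust crate dependencies (a fixed-output derivation);
  # cargoSetupHook points cargo at this archive during the build.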
  cargoDeps = rustPlatform.fetchCargoTarball {
    inherit src sourceRoot;
    name = "${pname}-${version}";
    hash = "sha256-3ICSjtiRfLOj+PXu6mcuDoAtod5uXAcabYWTLxEgI18=";
  };

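  # The Python bindings live in a subdirectory of the upstream repository.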
  sourceRoot = "source/bindings/python";

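  # setuptools-rust drives the build of the native extension; the Rust
  # toolchain itself comes from rustPlatform.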
  nativeBuildInputs = [ setuptools-rust ] ++ (with rustPlatform; [
    cargoSetupHook
    rust.cargo
    rust.rustc
  ]);

  propagatedBuildInputs = [
    numpy
  ];

  installCheckInputs = [
    datasets
    pytestCheckHook
    requests
  ];

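  # Run the tests as an install check so pytest imports the installed
  # package (with its compiled extension) rather than the source tree.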
  doCheck = false;
  doInstallCheck = true;

  postUnpack = ''
    # Add data files for the tests; otherwise the tests attempt network access.
    mkdir $sourceRoot/tests/data
    ( cd $sourceRoot/tests/data
      ln -s ${robertaVocab} roberta-base-vocab.json
      ln -s ${robertaMerges} roberta-base-merges.txt
      ln -s ${albertVocab} albert-base-v1-tokenizer.json
      ln -s ${bertVocab} bert-base-uncased-vocab.txt
      ln -s ${docPipelineTokenizer} bert-wiki.json
      ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
      ln -s ${norvigBig} big.txt
      ln -s ${openaiVocab} openai-gpt-vocab.json
      ln -s ${openaiMerges} openai-gpt-merges.txt )
  '';

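  # Some tests write to $HOME, which is not otherwise writable in the sandbox.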
  preCheck = ''
    HOME=$TMPDIR
  '';

  disabledTests = [
    # Downloads data using the datasets module.
    "TestTrainFromIterators"
  ];

  meta = with lib; {
    homepage = "https://github.com/huggingface/tokenizers";
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    license = licenses.asl20;
    platforms = platforms.unix;
    maintainers = with maintainers; [ danieldk ];
  };
}