{ stdenv
, lib
, rustPlatform
, fetchFromGitHub
, fetchurl
, maturin
, pipInstallHook
, pytest
, python
, requests
}:
11
12let
13 robertaVocab = fetchurl {
14 url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
15 sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
16 };
17 robertaMerges = fetchurl {
18 url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
19 sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
20 };
21 bertVocab = fetchurl {
22 url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
23 sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
24 };
25 openaiVocab = fetchurl {
26 url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
27 sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
28 };
29 openaiMerges = fetchurl {
30 url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
31 sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
32 };
33in rustPlatform.buildRustPackage rec {
34 pname = "tokenizers";
35 version = "0.8.1";
36
37 src = fetchFromGitHub {
38 owner = "huggingface";
39 repo = pname;
40 rev = "python-v${version}";
41 sha256 = "0sxdwx05hr87j2z32rk4rgwn6a26w9r7m5fgj6ah1sgagiiyxbjw";
42 };
43
44 # Update parking_lot to be compatible with recent Rust versions, that
45 # replace asm! by llvm_asm!:
46 #
47 # https://github.com/Amanieu/parking_lot/pull/223
48 #
49 # Remove once upstream updates this dependency.
50 cargoPatches = [ ./update-parking-lot.diff ];
51
52 cargoSha256 = "0cdkxmj8z2wdspn6r62lqlpvd0sj1z0cmb1zpqaajxvr0b2kjlj8";
53
54 sourceRoot = "source/bindings/python";
55
56 nativeBuildInputs = [
57 maturin
58 pipInstallHook
59 ];
60
61 propagatedBuildInputs = [
62 python
63 ];
64
65 # tokenizers uses pyo3, which requires Rust nightly.
66 RUSTC_BOOTSTRAP = 1;
67
68 doCheck = false;
69 doInstallCheck = true;
70
71 postUnpack = ''
72 # Add data files for tests, otherwise tests attempt network access.
73 mkdir $sourceRoot/tests/data
74 ( cd $sourceRoot/tests/data
75 ln -s ${robertaVocab} roberta-base-vocab.json
76 ln -s ${robertaMerges} roberta-base-merges.txt
77 ln -s ${bertVocab} bert-base-uncased-vocab.txt
78 ln -s ${openaiVocab} openai-gpt-vocab.json
79 ln -s ${openaiMerges} openai-gpt-merges.txt )
80 '';
81
82 postPatch = ''
83 # pyo3's build check verifies that Rust is a nightly
84 # version. Disable this check.
85 substituteInPlace $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/build.rs \
86 --replace "check_rustc_version()?;" ""
87
88 # Patching the vendored dependency invalidates the file
89 # checksums, so remove them. This should be safe, since
90 # this is just a copy of the vendored dependencies and
91 # the integrity of the vendored dependencies is validated
92 # by cargoSha256.
93 sed -r -i 's|"files":\{[^}]+\}|"files":{}|' \
94 $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/.cargo-checksum.json
95
96 # Maturin uses the crate name as the wheel name.
97 substituteInPlace Cargo.toml \
98 --replace "tokenizers-python" "tokenizers"
99 '';
100
101 buildPhase = ''
102 maturin build --release --manylinux off
103 '';
104
105 installPhase = ''
106 # Put the wheels where the pip install hook can find them.
107 install -Dm644 -t dist target/wheels/*.whl
108 pipInstallPhase
109 '';
110
111 installCheckInputs = [
112 pytest
113 requests
114 ];
115
116 installCheckPhase = ''
117 # Append paths, or the binding's tokenizer module will be
118 # used, since the test directories have __init__.py
119 pytest --import-mode=append
120 '';
121
122 meta = with stdenv.lib; {
123 homepage = "https://github.com/huggingface/tokenizers";
124 description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
125 license = licenses.asl20;
126 platforms = platforms.unix;
127 maintainers = with maintainers; [ danieldk ];
128 };
129}