{
  lib,
  linkFarm,
  fetchurl,
  buildPythonPackage,
  fetchFromGitHub,

  # nativeBuildInputs
  cargo,
  pkg-config,
  rustPlatform,
  rustc,
  setuptools-rust,

  # buildInputs
  openssl,

  # dependencies
  huggingface-hub,

  # tests
  datasets,
  numpy,
  pytestCheckHook,
  requests,
  tiktoken,
  writableTmpDirAsHomeHook,
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py
  # for details about URLs and file names
  test-data = linkFarm "tokenizers-test-data" {
    "roberta-base-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
      hash = "sha256-nn9jwtFdZmtS4h0lDS5RO4fJtxPPpph6gu2J5eblBlU=";
    };
    "roberta-base-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
      hash = "sha256-HOFmR3PFDz4MyIQmGak+3EYkUltyixiKngvjO3cmrcU=";
    };
    "albert-base-v1-tokenizer.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
      hash = "sha256-biqj1cpMaEG8NqUCgXnLTWPBKZMfoY/OOP2zjOxNKsM=";
    };
    "bert-base-uncased-vocab.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
      hash = "sha256-B+ztN1zsFE0nyQAkHz4zlHjeyVj5L928VR8pXJkgOKM=";
    };
    "big.txt" = fetchurl {
      url = "https://norvig.com/big.txt";
      hash = "sha256-+gZsfUDw8gGsQUTmUqpiQw5YprOAXscGUPZ42lgE6Hs=";
    };
    "bert-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
      hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
    };
    "tokenizer-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
      hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
    };
    "openai-gpt-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
      hash = "sha256-/fSbGefeI2hSCR2gm4Sno81eew55kWN2z0X2uBJ7gHg=";
    };
    "openai-gpt-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
      hash = "sha256-Dqm1GuaVBzzYceA1j3AWMR1nGn/zlj42fVI2Ui8pRyU=";
    };
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.21.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    tag = "v${version}";
    hash = "sha256-3S7ZCaZnnwyNjoZ4Y/q3ngQE2MIm2iyCCjYAkdMVG2A=";
  };

  # TestUnigram.test_continuing_prefix_trainer_mistmatch (typo is in the upstream
  # test name) fails with:
  #   Exception: No such file or directory (os error 2)
  # Fix submitted upstream: https://github.com/huggingface/tokenizers/pull/1747
  postPatch = ''
    substituteInPlace tests/bindings/test_trainers.py \
      --replace-fail '"data/' '"tests/data/'
  '';

  cargoDeps = rustPlatform.fetchCargoVendor {
    inherit
      pname
      version
      src
      sourceRoot
      ;
    hash = "sha256-I7LlBmeVY2rWI0ta6x311iAurQKuutsClrbUgkt9xWk=";
  };

  sourceRoot = "${src.name}/bindings/python";

  nativeBuildInputs = [
    cargo
    pkg-config
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    rustc
    setuptools-rust
  ];

  buildInputs = [
    openssl
  ];

  dependencies = [
    huggingface-hub
  ];

  nativeCheckInputs = [
    datasets
    numpy
    pytestCheckHook
    requests
    tiktoken
    writableTmpDirAsHomeHook
  ];
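
  # Note: writableTmpDirAsHomeHook (in nativeCheckInputs above) points HOME at
  # a writable temporary directory, so huggingface_hub's default cache
  # ($HOME/.cache/huggingface) is usable inside the build sandbox during the
  # check phase.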

  postUnpack =
    # Add data files for the tests; without them the tests attempt network access
    ''
      mkdir $sourceRoot/tests/data
      ln -s ${test-data}/* $sourceRoot/tests/data/
    '';

  pythonImportsCheck = [ "tokenizers" ];

  disabledTests = [
    # These tests download data using the datasets module
    "test_encode_special_tokens"
    "test_splitting"
    "TestTrainFromIterators"

    # These tests require more data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    # The "mistmatch" spelling matches the upstream test name
    "test_continuing_prefix_trainer_mistmatch"
  ];

  disabledTestPaths = [
    # fixture 'model' not found
    "benches/test_tiktoken.py"
  ];

  meta = {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    changelog = "https://github.com/huggingface/tokenizers/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ GaetanLepage ];
    platforms = lib.platforms.unix;
  };
}
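
# A minimal usage sketch, assuming this file is call-packaged as
# python3Packages.tokenizers in a nixpkgs checkout (the attribute path is an
# assumption, not fixed by this file):
#
#   nix-build -A python3Packages.tokenizers
#   nix-shell -p "python3.withPackages (ps: [ ps.tokenizers ])"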