{
  lib,
  stdenv,
  linkFarm,
  buildPythonPackage,
  cargo,
  datasets,
  huggingface-hub,
  fetchFromGitHub,
  fetchurl,
  libiconv,
  numpy,
  openssl,
  pkg-config,
  pytestCheckHook,
  python,
  pythonOlder,
  requests,
  rustPlatform,
  rustc,
  Security,
  setuptools-rust,
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
  # about URLs and file names
  test-data = linkFarm "tokenizers-test-data" {
    "roberta-base-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
      sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
    };
    "roberta-base-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
      sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
    };
    "albert-base-v1-tokenizer.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
      sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
    };
    "bert-base-uncased-vocab.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
      sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
    };
    "big.txt" = fetchurl {
      url = "https://norvig.com/big.txt";
      sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
    };
    "bert-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
      hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
    };
    "tokenizer-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
      hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
    };
    "openai-gpt-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
      sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
    };
    "openai-gpt-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
      sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
    };
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.19.1";
  pyproject = true;

  disabled = pythonOlder "3.7";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    rev = "refs/tags/v${version}";
    hash = "sha256-sKEAt46cdme821tzz9WSKnQb3hPmFJ4zvHgBNRxjEuk=";
  };

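  # The Rust crate graph for the Python bindings is vendored from the
  # Cargo.lock checked in next to this expression. A sketch (not run by the
  # build) for refreshing that lock file after a version bump, from a checkout
  # of the pinned upstream tag:
  #
  #   nix-shell -p cargo --run \
  #     'cargo generate-lockfile --manifest-path bindings/python/Cargo.toml'
  #
  # then copy bindings/python/Cargo.lock next to this file.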
  cargoDeps = rustPlatform.importCargoLock { lockFile = ./Cargo.lock; };

  sourceRoot = "${src.name}/bindings/python";
  maturinBuildFlags = [ "--interpreter ${python.executable}" ];

  nativeBuildInputs = [
    pkg-config
    setuptools-rust
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    cargo
    rustc
  ];

  buildInputs =
    [ openssl ]
    ++ lib.optionals stdenv.isDarwin [
      libiconv
      Security
    ];

  # Cargo.lock is outdated
  # TODO: remove at next release
  preConfigure = ''
    cargo update --offline
  '';

  dependencies = [
    numpy
    huggingface-hub
  ];

  nativeCheckInputs = [
    datasets
    pytestCheckHook
    requests
  ];

  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access
    mkdir $sourceRoot/tests/data
    ln -s ${test-data}/* $sourceRoot/tests/data/
  '';

  # The tests (via huggingface-hub) cache downloads under $HOME, which is not
  # writable in the build sandbox, so point it at a throwaway directory.
  preCheck = ''
    export HOME=$(mktemp -d)
  '';

  pythonImportsCheck = [ "tokenizers" ];

  disabledTests = [
    # Downloads data using the datasets module
    "test_encode_special_tokens"
    "test_splitting"
    "TestTrainFromIterators"
    # Those tests require more data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    # "mistmatch" is how upstream spells this test's name
    "test_continuing_prefix_trainer_mistmatch"
  ];

  meta = with lib; {
    description = "Fast state-of-the-art tokenizers optimized for research and production";
    homepage = "https://github.com/huggingface/tokenizers";
    license = licenses.asl20;
    maintainers = with maintainers; [ ];
    platforms = platforms.unix;
  };
}
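# A quick smoke test of the result (a sketch; assumes a nixpkgs checkout where
# this expression is wired up as python3Packages.tokenizers):
#
#   nix-build -A python3Packages.tokenizers
#   nix-shell -p "python3.withPackages (ps: [ ps.tokenizers ])" \
#     --run 'python -c "import tokenizers; print(tokenizers.__version__)"'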