Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
{
  lib,
  stdenv,
  linkFarm,
  buildPythonPackage,
  cargo,
  datasets,
  huggingface-hub,
  fetchFromGitHub,
  fetchurl,
  libiconv,
  numpy,
  openssl,
  pkg-config,
  pytestCheckHook,
  python,
  pythonOlder,
  requests,
  rustPlatform,
  rustc,
  Security,
  setuptools-rust,
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
  # about URLs and file names
  test-data = linkFarm "tokenizers-test-data" {
    "roberta-base-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
      sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
    };
    "roberta-base-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
      sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
    };
    "albert-base-v1-tokenizer.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
      sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
    };
    "bert-base-uncased-vocab.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
      sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
    };
    "big.txt" = fetchurl {
      url = "https://norvig.com/big.txt";
      sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
    };
    "bert-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
      hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
    };
    "tokenizer-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
      hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
    };
    "openai-gpt-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
      sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
    };
    "openai-gpt-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
      sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
    };
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.19.1";
  pyproject = true;

  disabled = pythonOlder "3.7";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    rev = "refs/tags/v${version}";
    hash = "sha256-sKEAt46cdme821tzz9WSKnQb3hPmFJ4zvHgBNRxjEuk=";
  };

  cargoDeps = rustPlatform.importCargoLock { lockFile = ./Cargo.lock; };

  sourceRoot = "${src.name}/bindings/python";
  maturinBuildFlags = [ "--interpreter ${python.executable}" ];

  nativeBuildInputs = [
    pkg-config
    setuptools-rust
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    cargo
    rustc
  ];

  buildInputs =
    [ openssl ]
    ++ lib.optionals stdenv.isDarwin [
      libiconv
      Security
    ];

  dependencies = [
    numpy
    huggingface-hub
  ];

  nativeCheckInputs = [
    datasets
    pytestCheckHook
    requests
  ];

  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access
    mkdir $sourceRoot/tests/data
    ln -s ${test-data}/* $sourceRoot/tests/data/
  '';

  preCheck = ''
    export HOME=$(mktemp -d);
  '';

  pythonImportsCheck = [ "tokenizers" ];

  disabledTests = [
    # Downloads data using the datasets module
    "test_encode_special_tokens"
    "test_splitting"
    "TestTrainFromIterators"
    # Those tests require more data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  meta = with lib; {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    license = licenses.asl20;
    maintainers = [ ];
    platforms = platforms.unix;
  };
}
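For a quick smoke test outside the nixpkgs tree, the expression above can be exercised with python3Packages.callPackage. A minimal sketch, assuming the file is saved as tokenizers.nix next to its Cargo.lock and that <nixpkgs> resolves to a recent channel; the driver file name build-test.nix is illustrative, not part of the package file:

# build-test.nix -- hypothetical driver expression
with import <nixpkgs> { };
python3Packages.callPackage ./tokenizers.nix {
  # Security is an explicit argument of the expression above; it is only
  # added to buildInputs on Darwin, so this value is never forced on Linux.
  inherit (darwin.apple_sdk.frameworks) Security;
}

Building this with nix-build build-test.nix should run the full pipeline: vendoring crates from Cargo.lock, building the wheel via the maturin hook, linking the linkFarm test fixtures into tests/data during postUnpack, and finally the pytest check phase.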