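# Python bindings for the Hugging Face `tokenizers` library. The wheel is
# built from the Rust workspace with maturin; the test fixtures below are
# pinned by hash so the test suite can run without network access.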
{
  lib,
  linkFarm,
  fetchurl,
  buildPythonPackage,
  fetchFromGitHub,
  python,

  # nativeBuildInputs
  cargo,
  pkg-config,
  rustPlatform,
  rustc,
  setuptools-rust,

  # buildInputs
  openssl,

  # dependencies
  huggingface-hub,

  # tests
  datasets,
  numpy,
  pytestCheckHook,
  requests,
  tiktoken,
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
  # about URLs and file names
  test-data = linkFarm "tokenizers-test-data" {
    "roberta-base-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
      hash = "sha256-nn9jwtFdZmtS4h0lDS5RO4fJtxPPpph6gu2J5eblBlU=";
    };
    "roberta-base-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
      hash = "sha256-HOFmR3PFDz4MyIQmGak+3EYkUltyixiKngvjO3cmrcU=";
    };
    "albert-base-v1-tokenizer.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
      hash = "sha256-biqj1cpMaEG8NqUCgXnLTWPBKZMfoY/OOP2zjOxNKsM=";
    };
    "bert-base-uncased-vocab.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
      hash = "sha256-B+ztN1zsFE0nyQAkHz4zlHjeyVj5L928VR8pXJkgOKM=";
    };
    "big.txt" = fetchurl {
      url = "https://norvig.com/big.txt";
      hash = "sha256-+gZsfUDw8gGsQUTmUqpiQw5YprOAXscGUPZ42lgE6Hs=";
    };
    "bert-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
      hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
    };
    "tokenizer-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
      hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
    };
    "openai-gpt-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
      hash = "sha256-/fSbGefeI2hSCR2gm4Sno81eew55kWN2z0X2uBJ7gHg=";
    };
    "openai-gpt-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
      hash = "sha256-Dqm1GuaVBzzYceA1j3AWMR1nGn/zlj42fVI2Ui8pRyU=";
    };
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.21.0";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    tag = "v${version}";
    hash = "sha256-G65XiVlvJXOC9zqcVr9vWamUnpC0aa4kyYkE2v1K2iY=";
  };

  # Vendored Rust dependencies, pinned by hash
  cargoDeps = rustPlatform.fetchCargoTarball {
    inherit
      pname
      version
      src
      sourceRoot
      ;
    hash = "sha256-5cw63ydyhpMup2tOe/hpG2W6YZ+cvT75MJBkE5Wap4s=";
  };

  # The Python bindings live in a subdirectory of the upstream Rust workspace
  sourceRoot = "${src.name}/bindings/python";

  # Build the wheel against the exact interpreter this package set targets
  maturinBuildFlags = [ "--interpreter ${python.executable}" ];

  nativeBuildInputs = [
    cargo
    pkg-config
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    rustc
    setuptools-rust
  ];

  buildInputs = [
    openssl
  ];

  dependencies = [
    huggingface-hub
  ];

  nativeCheckInputs = [
    datasets
    numpy
    pytestCheckHook
    requests
    tiktoken
  ];

  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access
    mkdir $sourceRoot/tests/data
    ln -s ${test-data}/* $sourceRoot/tests/data/
  '';

  # huggingface-hub caches under $HOME, which is not writable in the sandbox
  preCheck = ''
    export HOME=$(mktemp -d)
  '';

  pythonImportsCheck = [ "tokenizers" ];

  disabledTests = [
    # Downloads data using the datasets module
    "test_encode_special_tokens"
    "test_splitting"
    "TestTrainFromIterators"

    # These tests require more data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  disabledTestPaths = [
    # fixture 'model' not found
    "benches/test_tiktoken.py"
  ];

  meta = {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    changelog = "https://github.com/huggingface/tokenizers/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ GaetanLepage ];
    platforms = lib.platforms.unix;
  };
}
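# Sketch of how this derivation is typically built, assuming the file sits at
# its usual nixpkgs location and is wired up in python-packages.nix (the
# attribute path is an assumption, not taken from this file):
#
#   nix-build -A python3Packages.tokenizers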