{
  lib,
  linkFarm,
  fetchurl,
  buildPythonPackage,
  fetchFromGitHub,

  # nativeBuildInputs
  cargo,
  pkg-config,
  rustPlatform,
  rustc,
  setuptools-rust,

  # buildInputs
  openssl,

  # dependencies
  huggingface-hub,

  # tests
  datasets,
  numpy,
  pytestCheckHook,
  requests,
  tiktoken,
  writableTmpDirAsHomeHook,
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
  # about URLs and file names
  test-data = linkFarm "tokenizers-test-data" {
    "roberta-base-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
      hash = "sha256-nn9jwtFdZmtS4h0lDS5RO4fJtxPPpph6gu2J5eblBlU=";
    };
    "roberta-base-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
      hash = "sha256-HOFmR3PFDz4MyIQmGak+3EYkUltyixiKngvjO3cmrcU=";
    };
    "albert-base-v1-tokenizer.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
      hash = "sha256-biqj1cpMaEG8NqUCgXnLTWPBKZMfoY/OOP2zjOxNKsM=";
    };
    "bert-base-uncased-vocab.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
      hash = "sha256-B+ztN1zsFE0nyQAkHz4zlHjeyVj5L928VR8pXJkgOKM=";
    };
    "tokenizer-llama3.json" = fetchurl {
      url = "https://huggingface.co/Narsil/llama-tokenizer/resolve/main/tokenizer.json";
      hash = "sha256-eePlImNfMXEwCRO7QhRkqH3mIiGCoFcLmyzLoqlksrQ=";
    };
    "big.txt" = fetchurl {
      url = "https://norvig.com/big.txt";
      hash = "sha256-+gZsfUDw8gGsQUTmUqpiQw5YprOAXscGUPZ42lgE6Hs=";
    };
    "bert-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
      hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
    };
    "tokenizer-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
      hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
    };
    "openai-gpt-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
      hash = "sha256-/fSbGefeI2hSCR2gm4Sno81eew55kWN2z0X2uBJ7gHg=";
    };
    "openai-gpt-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
      hash = "sha256-Dqm1GuaVBzzYceA1j3AWMR1nGn/zlj42fVI2Ui8pRyU=";
    };
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.21.3";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    tag = "v${version}";
    hash = "sha256-8z1jgH0Nj7D+joN42AA2ORNSLvcfWiYHn4dpTq1HWB0=";
  };

  postPatch = ''
    ln -s ${./Cargo.lock} Cargo.lock
  '';
  cargoDeps = rustPlatform.importCargoLock {
    lockFile = ./Cargo.lock;
  };

  sourceRoot = "${src.name}/bindings/python";

  nativeBuildInputs = [
    cargo
    pkg-config
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    rustc
    setuptools-rust
  ];

  buildInputs = [
    openssl
  ];

  dependencies = [
    huggingface-hub
  ];

  nativeCheckInputs = [
    datasets
    numpy
    pytestCheckHook
    requests
    tiktoken
    writableTmpDirAsHomeHook
  ];

  postUnpack =
    # Add data files for tests, otherwise tests attempt network access
    ''
      mkdir $sourceRoot/tests/data
      ln -s ${test-data}/* $sourceRoot/tests/data/
    '';

  pythonImportsCheck = [ "tokenizers" ];

  disabledTests = [
    # Downloads data using the datasets module
    "test_encode_special_tokens"
    "test_splitting"
    "TestTrainFromIterators"

    # Those tests require more data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  disabledTestPaths = [
    # fixture 'model' not found
    "benches/test_tiktoken.py"
  ];

  meta = {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    changelog = "https://github.com/huggingface/tokenizers/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ GaetanLepage ];
    platforms = lib.platforms.unix;
  };
}