Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
{ lib
, stdenv
, buildPythonPackage
, cargo
, datasets
, fetchFromGitHub
, fetchurl
, libiconv
, numpy
, openssl
, pkg-config
, pytestCheckHook
, pythonOlder
, requests
, rustPlatform
, rustc
, Security
, setuptools-rust
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
  # about URLs and file names
  robertaVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
    sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
  };
  robertaMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
    sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
  };
  albertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
    sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
  };
  bertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
    sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
  };
  norvigBig = fetchurl {
    url = "https://norvig.com/big.txt";
    sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
  };
  docPipelineTokenizer = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
    hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
  };
  docQuicktourTokenizer = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
    hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
  };
  openaiVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
    sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
  };
  openaiMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
    sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.13.3";

  disabled = pythonOlder "3.7";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = pname;
    rev = "python-v${version}";
    hash = "sha256-QZG5jmr3vbyQs4mVBjwVDR31O66dUM+p39R0htJ1umk=";
  };

  postPatch = ''
    ln -s ${./Cargo.lock} Cargo.lock
  '';

  cargoDeps = rustPlatform.importCargoLock {
    lockFile = ./Cargo.lock;
  };

  sourceRoot = "source/bindings/python";

  nativeBuildInputs = [
    pkg-config
    setuptools-rust
    rustPlatform.cargoSetupHook
    cargo
    rustc
  ];

  buildInputs = [
    openssl
  ] ++ lib.optionals stdenv.isDarwin [
    libiconv
    Security
  ];

  propagatedBuildInputs = [
    numpy
  ];

  nativeCheckInputs = [
    datasets
    pytestCheckHook
    requests
  ];

  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access
    mkdir $sourceRoot/tests/data
    ( cd $sourceRoot/tests/data
      ln -s ${robertaVocab} roberta-base-vocab.json
      ln -s ${robertaMerges} roberta-base-merges.txt
      ln -s ${albertVocab} albert-base-v1-tokenizer.json
      ln -s ${bertVocab} bert-base-uncased-vocab.txt
      ln -s ${docPipelineTokenizer} bert-wiki.json
      ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
      ln -s ${norvigBig} big.txt
      ln -s ${openaiVocab} openai-gpt-vocab.json
      ln -s ${openaiMerges} openai-gpt-merges.txt )
  '';

  preCheck = ''
    export HOME=$(mktemp -d);
  '';

  pythonImportsCheck = [
    "tokenizers"
  ];

  disabledTests = [
    # Downloads data using the datasets module
    "TestTrainFromIterators"
    # Those tests require more data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  meta = with lib; {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    license = licenses.asl20;
    maintainers = with maintainers; [ ];
    platforms = platforms.unix;
  };
}
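
For reference, a minimal sketch of how a derivation like this is typically consumed once it is wired into the clone. The attribute path python3Packages.tokenizers is an assumption based on the pname above, not something stated in this file; adjust it if the tree exposes the package under a different name.

  # shell.nix sketch: a Python environment that includes the tokenizers package
  # (assumes <nixpkgs> points at this checkout and the attribute is ps.tokenizers)
  { pkgs ? import <nixpkgs> { } }:

  pkgs.mkShell {
    packages = [
      (pkgs.python3.withPackages (ps: [ ps.tokenizers ]))
    ];
  }

Building the package on its own would then be a matter of evaluating that attribute from the root of the checkout, e.g. nix-build -A python3Packages.tokenizers.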