{ lib
, stdenv
, buildPythonPackage
, datasets
, fetchFromGitHub
, fetchurl
, libiconv
, numpy
, openssl
, pkg-config
, pytestCheckHook
, pythonOlder
, requests
, rustPlatform
, Security
, setuptools-rust
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
  # about URLs and file names
  robertaVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
    sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
  };
  robertaMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
    sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
  };
  albertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
    sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
  };
  bertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
    sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
  };
  norvigBig = fetchurl {
    url = "https://norvig.com/big.txt";
    sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
  };
  docPipelineTokenizer = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
    hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
  };
  docQuicktourTokenizer = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
    hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
  };
  openaiVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
    sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
  };
  openaiMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
    sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.12.1";

  disabled = pythonOlder "3.7";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = pname;
    rev = "python-v${version}";
    hash = "sha256-XIXKgcqa6ToAH4OkyaaJALOS9F+sD8d5Z71RttRcIsw=";
  };

  cargoDeps = rustPlatform.fetchCargoTarball {
    inherit src sourceRoot;
    name = "${pname}-${version}";
    sha256 = "sha256-Euvf0LNMa2Od+6gY1Ldge/7VPrH5mJoZduRRsb+lM/E=";
  };

  sourceRoot = "source/bindings/python";

  nativeBuildInputs = [
    pkg-config
    setuptools-rust
  ] ++ (with rustPlatform; [
    cargoSetupHook
    rust.cargo
    rust.rustc
  ]);

  buildInputs = [
    openssl
  ] ++ lib.optionals stdenv.isDarwin [
    libiconv
    Security
  ];

  propagatedBuildInputs = [
    numpy
  ];

  checkInputs = [
    datasets
    pytestCheckHook
    requests
  ];

  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access
    mkdir $sourceRoot/tests/data
    ( cd $sourceRoot/tests/data
      ln -s ${robertaVocab} roberta-base-vocab.json
      ln -s ${robertaMerges} roberta-base-merges.txt
      ln -s ${albertVocab} albert-base-v1-tokenizer.json
      ln -s ${bertVocab} bert-base-uncased-vocab.txt
      ln -s ${docPipelineTokenizer} bert-wiki.json
      ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
      ln -s ${norvigBig} big.txt
      ln -s ${openaiVocab} openai-gpt-vocab.json
      ln -s ${openaiMerges} openai-gpt-merges.txt )
  '';

  preCheck = ''
    export HOME=$(mktemp -d);
  '';

  pythonImportsCheck = [
    "tokenizers"
  ];

  disabledTests = [
    # Downloads data using the datasets module
    "TestTrainFromIterators"
    # Those tests require more data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  meta = with lib; {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    license = licenses.asl20;
    maintainers = with maintainers; [ ];
    platforms = platforms.unix;
  };
}