{ lib
, stdenv
, buildPythonPackage
, cargo
, datasets
, fetchFromGitHub
, fetchurl
, libiconv
, numpy
, openssl
, pkg-config
, pytestCheckHook
, pythonOlder
, requests
, rustPlatform
, rustc
, Security
, setuptools-rust
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
  # about URLs and file names
  robertaVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
    sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
  };
  robertaMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
    sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
  };
  albertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
    sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
  };
  bertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
    sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
  };
  norvigBig = fetchurl {
    url = "https://norvig.com/big.txt";
    sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
  };
  docPipelineTokenizer = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
    hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
  };
  docQuicktourTokenizer = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
    hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
  };
  openaiVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
    sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
  };
  openaiMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
    sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.14.1";
  format = "pyproject";

  disabled = pythonOlder "3.7";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = pname;
    rev = "v${version}";
    hash = "sha256-cq7dQLttNkV5UUhXujxKKMuzhD7hz+zTTKxUKlvz1s0=";
  };

  cargoDeps = rustPlatform.importCargoLock {
    lockFile = ./Cargo.lock;
  };

  sourceRoot = "${src.name}/bindings/python";

  nativeBuildInputs = [
    pkg-config
    setuptools-rust
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    cargo
    rustc
  ];

  buildInputs = [
    openssl
  ] ++ lib.optionals stdenv.isDarwin [
    libiconv
    Security
  ];

  propagatedBuildInputs = [
    numpy
  ];

  nativeCheckInputs = [
    datasets
    pytestCheckHook
    requests
  ];

  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access
    mkdir $sourceRoot/tests/data
    ( cd $sourceRoot/tests/data
      ln -s ${robertaVocab} roberta-base-vocab.json
      ln -s ${robertaMerges} roberta-base-merges.txt
      ln -s ${albertVocab} albert-base-v1-tokenizer.json
      ln -s ${bertVocab} bert-base-uncased-vocab.txt
      ln -s ${docPipelineTokenizer} bert-wiki.json
      ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
      ln -s ${norvigBig} big.txt
      ln -s ${openaiVocab} openai-gpt-vocab.json
      ln -s ${openaiMerges} openai-gpt-merges.txt )
  '';

  preCheck = ''
    export HOME=$(mktemp -d)
  '';

  pythonImportsCheck = [
    "tokenizers"
  ];

  disabledTests = [
    # Downloads data using the datasets module
    "TestTrainFromIterators"
    # Those tests require more data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  meta = with lib; {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    license = licenses.asl20;
    maintainers = with maintainers; [ ];
    platforms = platforms.unix;
  };
}