{ lib
, fetchFromGitHub
, fetchurl
, buildPythonPackage
, rustPlatform
, setuptools-rust
, numpy
, datasets
, pytestCheckHook
, requests
}:

let
  # Pretrained vocabulary, merges, and tokenizer files. These are linked into
  # tests/data in postUnpack so the test suite needs no network access.
  robertaVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
    sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
  };
  robertaMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
    sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
  };
  albertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
    sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
  };
  bertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
    sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
  };
  norvigBig = fetchurl {
    url = "https://norvig.com/big.txt";
    sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
  };
  docPipelineTokenizer = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
    hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
  };
  docQuicktourTokenizer = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
    hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
  };
  openaiVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
    sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
  };
  openaiMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
    sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
  };
in buildPythonPackage rec {
  pname = "tokenizers";
  version = "unstable-2021-08-13";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = pname;
    rev = "e7dd6436dd4a4ffd9e8a4f110ca68e6a38677cb6";
    sha256 = "1p7w9a43a9h6ys5nsa4g89l65dj11037p7a1lqkj4x1yc9kv2y1r";
  };

  # Vendored Rust dependencies pinned by the bindings' Cargo.lock.
  cargoDeps = rustPlatform.fetchCargoTarball {
    inherit src sourceRoot;
    name = "${pname}-${version}";
    sha256 = "1yb4jsx6mp9jgd1g3mli6vr6mri2afnwqlmxq1rpvn34z6b3iw9q";
  };

  # The Python bindings live in a subdirectory of the repository.
  sourceRoot = "source/bindings/python";

  nativeBuildInputs = [ setuptools-rust ] ++ (with rustPlatform; [
    cargoSetupHook
    rust.cargo
    rust.rustc
  ]);

  propagatedBuildInputs = [
    numpy
  ];

  checkInputs = [
    datasets
    pytestCheckHook
    requests
  ];

  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access.
    mkdir $sourceRoot/tests/data
    ( cd $sourceRoot/tests/data
      ln -s ${robertaVocab} roberta-base-vocab.json
      ln -s ${robertaMerges} roberta-base-merges.txt
      ln -s ${albertVocab} albert-base-v1-tokenizer.json
      ln -s ${bertVocab} bert-base-uncased-vocab.txt
      ln -s ${docPipelineTokenizer} bert-wiki.json
      ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
      ln -s ${norvigBig} big.txt
      ln -s ${openaiVocab} openai-gpt-vocab.json
      ln -s ${openaiMerges} openai-gpt-merges.txt )
  '';

  # Pin the multiprocessing start method for the test suite; the platform
  # default is not fork everywhere (e.g. it is spawn on Darwin).
  postPatch = ''
    echo 'import multiprocessing; multiprocessing.set_start_method("fork")' >> tests/__init__.py
  '';

  # Tests need a writable HOME for cache files.
  preCheck = ''
    HOME=$TMPDIR
  '';

  disabledTests = [
    # Downloads data using the datasets module.
    "TestTrainFromIterators"
  ];

  meta = with lib; {
    homepage = "https://github.com/huggingface/tokenizers";
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    license = licenses.asl20;
    platforms = platforms.unix;
    maintainers = with maintainers; [ ];
  };
}