Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
at 20.09-beta 129 lines 3.9 kB view raw
# Python bindings of Hugging Face `tokenizers`: the wheel is built from the
# Rust sources with maturin and installed via the pip install hook.
{ stdenv
, rustPlatform
, fetchFromGitHub
, fetchurl
, maturin
, pipInstallHook
, pytest
, python
, requests
}:

let
  inherit (stdenv) lib;

  # Fixture files that postUnpack links into tests/data so the install check
  # does not attempt network access.
  robertaVocabFile = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
    sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
  };
  robertaMergesFile = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
    sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
  };
  bertVocabFile = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
    sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
  };
  openaiVocabFile = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
    sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
  };
  openaiMergesFile = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
    sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
  };
in
rustPlatform.buildRustPackage rec {
  pname = "tokenizers";
  version = "0.8.1";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = pname;
    rev = "python-v${version}";
    sha256 = "0sxdwx05hr87j2z32rk4rgwn6a26w9r7m5fgj6ah1sgagiiyxbjw";
  };

  # The Python bindings live in a subdirectory of the upstream repository.
  sourceRoot = "source/bindings/python";

  # Bump parking_lot so that it builds with recent Rust releases, which
  # replaced asm! by llvm_asm!:
  #
  #   https://github.com/Amanieu/parking_lot/pull/223
  #
  # Drop this patch once upstream has updated the dependency.
  cargoPatches = [ ./update-parking-lot.diff ];

  cargoSha256 = "0cdkxmj8z2wdspn6r62lqlpvd0sj1z0cmb1zpqaajxvr0b2kjlj8";

  nativeBuildInputs = [ maturin pipInstallHook ];

  propagatedBuildInputs = [ python ];

  installCheckInputs = [ pytest requests ];

  # pyo3, which tokenizers depends on, requires Rust nightly.
  RUSTC_BOOTSTRAP = 1;

  # The cargo check phase is disabled; the Python test suite runs in the
  # install check phase instead.
  doCheck = false;
  doInstallCheck = true;

  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access.
    mkdir $sourceRoot/tests/data
    ( cd $sourceRoot/tests/data
      ln -s ${robertaVocabFile} roberta-base-vocab.json
      ln -s ${robertaMergesFile} roberta-base-merges.txt
      ln -s ${bertVocabFile} bert-base-uncased-vocab.txt
      ln -s ${openaiVocabFile} openai-gpt-vocab.json
      ln -s ${openaiMergesFile} openai-gpt-merges.txt )
  '';

  postPatch = ''
    # pyo3's build check verifies that Rust is a nightly
    # version. Disable this check.
    substituteInPlace $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/build.rs \
      --replace "check_rustc_version()?;" ""

    # Patching the vendored dependency invalidates the file
    # checksums, so remove them. This should be safe, since
    # this is just a copy of the vendored dependencies and
    # the integrity of the vendored dependencies is validated
    # by cargoSha256.
    sed -r -i 's|"files":\{[^}]+\}|"files":{}|' \
      $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/.cargo-checksum.json

    # Maturin uses the crate name as the wheel name.
    substituteInPlace Cargo.toml \
      --replace "tokenizers-python" "tokenizers"
  '';

  buildPhase = ''
    maturin build --release --manylinux off
  '';

  installPhase = ''
    # Put the wheels where the pip install hook can find them.
    install -Dm644 -t dist target/wheels/*.whl
    pipInstallPhase
  '';

  installCheckPhase = ''
    # Append paths, or the binding's tokenizer module will be
    # used, since the test directories have __init__.py
    pytest --import-mode=append
  '';

  meta = {
    homepage = "https://github.com/huggingface/tokenizers";
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    license = lib.licenses.asl20;
    platforms = lib.platforms.unix;
    maintainers = [ lib.maintainers.danieldk ];
  };
}