# Hugging Face `datasets` library, built from the tagged GitHub source.
#
# All arguments are supplied by the Python package set.
{
  lib,
  aiohttp,
  buildPythonPackage,
  dill,
  fetchFromGitHub,
  fsspec,
  huggingface-hub,
  # Unused: it was only referenced for interpreters older than 3.8, which
  # `disabled` below rejects. Kept so existing `.override` calls that pass it
  # continue to evaluate.
  importlib-metadata,
  multiprocess,
  numpy,
  packaging,
  pandas,
  pyarrow,
  pythonOlder,
  requests,
  responses,
  tqdm,
  xxhash,
}:

buildPythonPackage rec {
  pname = "datasets";
  version = "2.20.0";
  format = "setuptools";

  # Refuse to build for interpreters older than 3.8.
  disabled = pythonOlder "3.8";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "datasets";
    rev = "refs/tags/${version}";
    hash = "sha256-9mB4RXJVkmaK+fLEmyZAdf64YKGoAhE3RzMoj4/8K98=";
  };

  # Comment out the `pyarrow_hotfix` import (upstream's workaround for the
  # pyarrow<14.0.1 vulnerability); pyarrow-hotfix is not among the
  # dependencies propagated below.
  # `--replace-fail` aborts the build if the import line is no longer present,
  # so a version bump cannot silently turn this patch into a no-op.
  postPatch = ''
    substituteInPlace src/datasets/features/features.py \
      --replace-fail "import pyarrow_hotfix" "#import pyarrow_hotfix"
  '';

  propagatedBuildInputs = [
    aiohttp
    dill
    fsspec
    huggingface-hub
    multiprocess
    numpy
    packaging
    pandas
    pyarrow
    requests
    responses
    tqdm
    xxhash
  ];

  # Tests require pervasive internet access
  doCheck = false;

  # Importing the module attempts to create a cache directory, so point
  # HF_MODULES_CACHE at the build's temporary directory.
  # NOTE(review): presumably this export is what the later pythonImportsCheck
  # sees -- confirm against the import-check hook's phase ordering.
  postFixup = "export HF_MODULES_CACHE=$TMPDIR";

  pythonImportsCheck = [ "datasets" ];

  meta = {
    description = "Open-access datasets and evaluation metrics for natural language processing";
    mainProgram = "datasets-cli";
    homepage = "https://github.com/huggingface/datasets";
    changelog = "https://github.com/huggingface/datasets/releases/tag/${version}";
    license = lib.licenses.asl20;
    platforms = lib.platforms.unix;
    maintainers = [ ];
  };
}