# Package definition for HuggingFace `datasets`, built with
# buildPythonPackage from a tagged GitHub source checkout.
#
# Arguments are the dependencies injected by the caller. `fetchpatch` and
# `importlib-metadata` are accepted but not used below; they stay in the
# argument set so existing callers/overrides that pass them keep working.
{
  lib,
  aiohttp,
  buildPythonPackage,
  dill,
  fetchFromGitHub,
  fetchpatch,
  fsspec,
  huggingface-hub,
  importlib-metadata,
  multiprocess,
  numpy,
  packaging,
  pandas,
  pyarrow,
  pythonOlder,
  requests,
  responses,
  tqdm,
  xxhash,
}:

buildPythonPackage rec {
  pname = "datasets";
  version = "2.19.0";
  format = "setuptools";

  # Not built for Python interpreters older than 3.8.
  disabled = pythonOlder "3.8";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = pname;
    rev = "refs/tags/${version}";
    hash = "sha256-m3x3/MCezA0WjYKBa2F12emMZdwLKi/9bFBf59A4qs8=";
  };

  # Comment out the `pyarrow_hotfix` import, which upstream carries as a
  # vulnerability fix for pyarrow<14.0.1 (the module is not among the
  # dependencies listed below).
  # `--replace-fail` aborts the build if the import line is no longer present,
  # instead of silently leaving the source unpatched.
  postPatch = ''
    substituteInPlace src/datasets/features/features.py \
      --replace-fail "import pyarrow_hotfix" "#import pyarrow_hotfix"
  '';

  # No conditional importlib-metadata entry: it would only apply for
  # `pythonOlder "3.8"`, and the package is disabled for exactly that case.
  propagatedBuildInputs = [
    aiohttp
    dill
    fsspec
    huggingface-hub
    multiprocess
    numpy
    packaging
    pandas
    pyarrow
    requests
    responses
    tqdm
    xxhash
  ];

  # Tests require pervasive internet access
  doCheck = false;

  # Importing the module attempts to create a cache directory; point it at the
  # build's temporary directory.
  # NOTE(review): this relies on the exported variable still being set when
  # the import check below runs — confirm against the hook ordering.
  postFixup = "export HF_MODULES_CACHE=$TMPDIR";

  pythonImportsCheck = [ "datasets" ];

  meta = with lib; {
    description = "Open-access datasets and evaluation metrics for natural language processing";
    mainProgram = "datasets-cli";
    homepage = "https://github.com/huggingface/datasets";
    changelog = "https://github.com/huggingface/datasets/releases/tag/${version}";
    license = licenses.asl20;
    platforms = platforms.unix;
    maintainers = with maintainers; [ ];
  };
}