1{
2 lib,
3 buildPythonPackage,
4 fetchFromGitHub,
5 fetchpatch,
6
7 # dependencies
8 array-record,
9 dill,
10 dm-tree,
11 future,
12 immutabledict,
13 importlib-resources,
14 numpy,
15 promise,
16 protobuf,
17 psutil,
18 requests,
19 simple-parsing,
20 six,
21 tensorflow-metadata,
22 termcolor,
23 tqdm,
24
25 # tests
26 apache-beam,
27 beautifulsoup4,
28 click,
29 datasets,
30 ffmpeg,
31 imagemagick,
32 jax,
33 jaxlib,
34 jinja2,
35 langdetect,
36 lxml,
37 matplotlib,
38 mlcroissant,
39 mwparserfromhell,
40 mwxml,
41 networkx,
42 nltk,
43 opencv4,
44 pandas,
45 pillow,
46 pycocotools,
47 pydub,
48 pytest-xdist,
49 pytestCheckHook,
50 scikit-image,
51 scipy,
52 sortedcontainers,
53 tensorflow,
54 tifffile,
55 zarr,
56}:
57
buildPythonPackage rec {
  pname = "tensorflow-datasets";
  version = "4.9.8";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "tensorflow";
    repo = "datasets";
    tag = "v${version}";
    hash = "sha256-nqveZ+8b0f5sGIn6WufKeA37yEsZjzhCIbCfwMZ9JOM=";
  };

  patches = [
    # Newer mlcroissant uses `encoding_formats`, not `encoding_format`.
    # Backport https://github.com/tensorflow/datasets/pull/11037 until released.
    (fetchpatch {
      url = "https://github.com/tensorflow/datasets/commit/92cbcff725a1036569a515cc3356aa8480740451.patch";
      hash = "sha256-2hnMvQP83+eAJllce19aHujcoWQzUz3+LsasWCo4BtM=";
    })
  ];

  dependencies = [
    array-record
    dill
    dm-tree
    future
    immutabledict
    importlib-resources
    numpy
    promise
    protobuf
    psutil
    requests
    simple-parsing
    six
    tensorflow-metadata
    termcolor
    tqdm
  ];

  # Sanity check: the module must at least be importable.
  pythonImportsCheck = [ "tensorflow_datasets" ];

  nativeCheckInputs = [
    apache-beam
    beautifulsoup4
    click
    datasets
    ffmpeg
    imagemagick
    jax
    jaxlib
    jinja2
    langdetect
    lxml
    matplotlib
    mlcroissant
    mwparserfromhell
    mwxml
    networkx
    nltk
    opencv4
    pandas
    pillow
    pycocotools
    pydub
    pytest-xdist
    pytestCheckHook
    scikit-image
    scipy
    sortedcontainers
    tensorflow
    tifffile
    zarr
  ];

  # Individual tests deselected (files not disabled wholesale, unlike
  # disabledTestPaths below).
  pytestFlagsArray = [
    # AttributeError: 'NoneType' object has no attribute 'Table'
    "--deselect=tensorflow_datasets/core/file_adapters_test.py::test_read_write"
    "--deselect=tensorflow_datasets/text/c4_wsrs/c4_wsrs_test.py::C4WSRSTest"
  ];

  disabledTestPaths = [
    # Sandbox violations: network access, filesystem write attempts outside of build dir, ...
    "tensorflow_datasets/core/dataset_builder_test.py"
    "tensorflow_datasets/core/dataset_info_test.py"
    "tensorflow_datasets/core/features/features_test.py"
    "tensorflow_datasets/core/github_api/github_path_test.py"
    "tensorflow_datasets/core/registered_test.py"
    "tensorflow_datasets/core/utils/gcs_utils_test.py"
    "tensorflow_datasets/import_without_tf_test.py"
    "tensorflow_datasets/proto/build_tf_proto_test.py"
    "tensorflow_datasets/scripts/cli/build_test.py"
    "tensorflow_datasets/datasets/imagenet2012_corrupted/imagenet2012_corrupted_dataset_builder_test.py"

    # Requires `pretty_midi` which is not packaged in `nixpkgs`.
    "tensorflow_datasets/audio/groove.py"
    "tensorflow_datasets/datasets/groove/groove_dataset_builder_test.py"

    # Requires `crepe` which is not packaged in `nixpkgs`.
    "tensorflow_datasets/audio/nsynth.py"
    "tensorflow_datasets/datasets/nsynth/nsynth_dataset_builder_test.py"

    # Requires `conllu` which is not packaged in `nixpkgs`.
    "tensorflow_datasets/core/dataset_builders/conll/conllu_dataset_builder_test.py"
    "tensorflow_datasets/datasets/universal_dependencies/universal_dependencies_dataset_builder_test.py"
    "tensorflow_datasets/datasets/xtreme_pos/xtreme_pos_dataset_builder_test.py"

    # Requires `gcld3` and `pretty_midi` which are not packaged in `nixpkgs`.
    "tensorflow_datasets/core/lazy_imports_lib_test.py"

    # AttributeError: 'NoneType' object has no attribute 'Table'
    "tensorflow_datasets/core/dataset_builder_beam_test.py"
    "tensorflow_datasets/core/dataset_builders/adhoc_builder_test.py"
    "tensorflow_datasets/core/split_builder_test.py"
    "tensorflow_datasets/core/writer_test.py"

    # Requires `tensorflow_io` which is not packaged in `nixpkgs`.
    "tensorflow_datasets/core/features/audio_feature_test.py"
    "tensorflow_datasets/image/lsun_test.py"

    # Fails with `TypeError: Constant constructor takes either 0 or 2 positional arguments`
    # deep in TF AutoGraph. Doesn't reproduce in Docker with Ubuntu 22.04 => might be related
    # to the differences in some of the dependencies?
    "tensorflow_datasets/rl_unplugged/rlu_atari/rlu_atari_test.py"

    # Fails with `ValueError: setting an array element with a sequence`
    "tensorflow_datasets/core/dataset_utils_test.py"
    "tensorflow_datasets/core/features/sequence_feature_test.py"

    # Requires `tensorflow_docs` which is not packaged in `nixpkgs` and the test is for documentation anyway.
    "tensorflow_datasets/scripts/documentation/build_api_docs_test.py"

    # Not a test, should not be executed.
    "tensorflow_datasets/testing/test_utils.py"

    # Require `gcld3` and `nltk.punkt` which are not packaged in `nixpkgs`.
    "tensorflow_datasets/text/c4_test.py"
    "tensorflow_datasets/text/c4_utils_test.py"
  ];

  meta = {
    description = "Library of datasets ready to use with TensorFlow";
    homepage = "https://www.tensorflow.org/datasets/overview";
    changelog = "https://github.com/tensorflow/datasets/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ ndl ];
  };
}