# Package expression for the `mlcroissant` Python library, built from the
# `python/mlcroissant` subdirectory of the mlcommons/croissant repository.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # dependencies
  absl-py,
  etils,
  jsonpath-rw,
  networkx,
  pandas,
  pandas-stubs,
  python-dateutil,
  rdflib,
  requests,
  scipy,
  tqdm,

  # tests
  apache-beam,
  gitpython,
  pillow,
  pytestCheckHook,
  pyyaml,
  writableTmpDirAsHomeHook,
}:

let
  pname = "mlcroissant";
  version = "1.0.17";

  # Upstream release tags are the version number prefixed with "v".
  src = fetchFromGitHub {
    owner = "mlcommons";
    repo = "croissant";
    tag = "v${version}";
    hash = "sha256-jiyr8x+YRSsRwOVxDPaWemPqglTKVb5jg4rRzUXd3BE=";
  };
in
buildPythonPackage {
  inherit pname version src;
  pyproject = true;

  # The Python project is not at the repository root.
  sourceRoot = "${src.name}/python/mlcroissant";

  build-system = [ setuptools ];

  # The explicit runtime dependencies, followed by everything that the
  # `epath` extra of etils declares.
  dependencies = [
    absl-py
    etils
    jsonpath-rw
    networkx
    pandas
    pandas-stubs
    python-dateutil
    rdflib
    requests
    scipy
    tqdm
  ]
  ++ etils.optional-dependencies.epath;

  nativeCheckInputs = [
    apache-beam
    gitpython
    pillow
    pytestCheckHook
    pyyaml
    writableTmpDirAsHomeHook
  ];

  disabledTests = [
    # Requires internet access
    "test_hermetic_loading_1_1"
    "test_load_from_huggingface"
    "test_nonhermetic_loading"
    "test_nonhermetic_loading_1_0"

    # AssertionError: assert {'records/aud...t32), 22050)'} == {'records/aud...t32), 22050)'}
    "test_hermetic_loading"

    # AttributeError: 'MaybeReshuffle' object has no attribute 'side_inputs'
    "test_beam_hermetic_loading"
  ];

  pythonImportsCheck = [ "mlcroissant" ];

  meta = {
    description = "High-level format for machine learning datasets that brings together four rich layers";
    homepage = "https://github.com/mlcommons/croissant";
    changelog = "https://github.com/mlcommons/croissant/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.GaetanLepage ];
    platforms = lib.platforms.all;
    mainProgram = "mlcroissant";
  };
}