# Package recipe for mlcroissant, the Python library that lives in the
# python/mlcroissant subdirectory of the mlcommons/croissant repository.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # dependencies
  absl-py,
  etils,
  jsonpath-rw,
  networkx,
  pandas,
  pandas-stubs,
  python-dateutil,
  rdflib,
  requests,
  scipy,
  tqdm,

  # tests
  apache-beam,
  gitpython,
  pillow,
  pytestCheckHook,
  pyyaml,
  writableTmpDirAsHomeHook,
}:

let
  pname = "mlcroissant";
  version = "1.0.17";

  # Upstream release tags are the version prefixed with "v".
  src = fetchFromGitHub {
    owner = "mlcommons";
    repo = "croissant";
    tag = "v${version}";
    hash = "sha256-jiyr8x+YRSsRwOVxDPaWemPqglTKVb5jg4rRzUXd3BE=";
  };
in
buildPythonPackage {
  inherit pname version src;
  pyproject = true;

  # Build from the Python package subdirectory rather than the repository root.
  sourceRoot = "${src.name}/python/mlcroissant";

  build-system = [
    setuptools
  ];

  # Direct runtime dependencies, followed by the packages listed under
  # etils' "epath" optional-dependency group.
  dependencies = [
    absl-py
    etils
    jsonpath-rw
    networkx
    pandas
    pandas-stubs
    python-dateutil
    rdflib
    requests
    scipy
    tqdm
  ]
  ++ etils.optional-dependencies.epath;

  pythonImportsCheck = [ "mlcroissant" ];

  nativeCheckInputs = [
    apache-beam
    gitpython
    pillow
    pytestCheckHook
    pyyaml
    writableTmpDirAsHomeHook
  ];

  disabledTests = [
    # Need network access
    "test_hermetic_loading_1_1"
    "test_load_from_huggingface"
    "test_nonhermetic_loading"
    "test_nonhermetic_loading_1_0"

    # Fails with:
    #   AssertionError: assert {'records/aud...t32), 22050)'} == {'records/aud...t32), 22050)'}
    "test_hermetic_loading"

    # Fails with:
    #   AttributeError: 'MaybeReshuffle' object has no attribute 'side_inputs'
    "test_beam_hermetic_loading"
  ];

  meta = {
    description = "High-level format for machine learning datasets that brings together four rich layers";
    homepage = "https://github.com/mlcommons/croissant";
    changelog = "https://github.com/mlcommons/croissant/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.GaetanLepage ];
    platforms = lib.platforms.all;
    mainProgram = "mlcroissant";
  };
}