Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
# Python package expression for docling.
#
# The function takes its build helpers and Python dependencies as arguments
# and returns the result of `buildPythonPackage` for the pinned release
# fetched from GitHub.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build system
  poetry-core,

  # dependencies
  accelerate,
  beautifulsoup4,
  certifi,
  docling-core,
  docling-ibm-models,
  docling-parse,
  easyocr,
  filetype,
  huggingface-hub,
  lxml,
  marko,
  # ocrmac # not yet packaged
  onnxruntime,
  openpyxl,
  pandas,
  pillow,
  pluggy,
  pydantic,
  pydantic-settings,
  pylatexenc,
  pypdfium2,
  python-docx,
  python-pptx,
  rapidocr-onnxruntime,
  requests,
  rtree,
  scipy,
  tesserocr,
  tqdm,
  transformers,
  typer,

  # optional dependencies
  # mkdocs-click # not yet packaged
  mkdocs-jupyter,
  mkdocs-material,
  mkdocstrings,

  # tests
  pytestCheckHook,
  writableTmpDirAsHomeHook,
}:

buildPythonPackage rec {
  pname = "docling";
  version = "2.42.0";
  pyproject = true;

  # Source is pinned to the release tag derived from `version`.
  src = fetchFromGitHub {
    owner = "docling-project";
    repo = "docling";
    tag = "v${version}";
    hash = "sha256-9HUomW55Yg5N7u3Wb4imzRUYECeGkb3lkHPLEGzuAnA=";
  };

  build-system = [
    poetry-core
  ];

  dependencies = [
    accelerate
    beautifulsoup4
    certifi
    docling-core
    docling-ibm-models
    docling-parse
    easyocr
    filetype
    huggingface-hub
    lxml
    marko
    # ocrmac # not yet packaged
    onnxruntime
    openpyxl
    pandas
    pillow
    pluggy
    pydantic
    pydantic-settings
    pylatexenc
    pypdfium2
    python-docx
    python-pptx
    rapidocr-onnxruntime
    requests
    rtree
    scipy
    tesserocr
    tqdm
    transformers
    typer
  ];

  # Do not enforce upstream's version constraint on pillow.
  pythonRelaxDeps = [
    "pillow"
  ];

  # Extras, keyed by extra name.
  optional-dependencies = {
    ocrmac = [
      # ocrmac # not yet packaged
    ];
    rapidocr = [
      onnxruntime
      rapidocr-onnxruntime
    ];
    tesserocr = [
      tesserocr
    ];

    docs = [
      # mkdocs-click # not yet packaged
      mkdocs-jupyter
      mkdocs-material
      mkdocstrings
      # griffe-pydantic
    ];
  };

  nativeCheckInputs = [
    pytestCheckHook
    writableTmpDirAsHomeHook
  ];

  pythonImportsCheck = [
    "docling"
  ];

  # Tests skipped during the check phase; each group is annotated with the
  # failure it was observed to produce.
  disabledTests = [
    "test_e2e_pdfs_conversions" # AssertionError: ## TableFormer: Table Structure Understanding with Transf
    "test_e2e_conversions" # RuntimeError: Tesseract is not available

    # AssertionError
    # assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"
    "test_ordered_lists"

    # AssertionError: export to md
    "test_e2e_html_conversions"

    # AssertionError: assert 'Unordered li...d code block:' == 'Unordered li...d code block:'
    "test_convert_valid"

    # AssertionError: Markdown file mismatch against groundtruth pftaps057006474.md
    "test_patent_groundtruth"

    # huggingface_hub.errors.LocalEntryNotFoundError: An error happened
    "test_cli_convert"
    "test_code_and_formula_conversion"
    "test_picture_classifier"
    "test_convert_path"
    "test_convert_stream"
    "test_compare_legacy_output"
    "test_ocr_coverage_threshold"
    "test_formula_conversion_with_page_range"

    # requires network access
    "test_page_range"
    "test_parser_backends"
    "test_confidence"
    "test_e2e_webp_conversions"
    "test_asr_pipeline_conversion"

    # AssertionError: pred_itxt==true_itxt
    "test_e2e_valid_csv_conversions"
  ];

  meta = {
    description = "Get your documents ready for gen AI";
    # Keep these URLs on the same GitHub organisation as `src.owner`.
    homepage = "https://github.com/docling-project/docling";
    changelog = "https://github.com/docling-project/docling/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ happysalada ];
    mainProgram = "docling";
  };
}