at 25.11-pre 3.4 kB view raw
# Python package expression for docling ("Get your documents ready for gen AI").
#
# The function takes its build helpers and Python dependencies as arguments and
# returns the result of `buildPythonPackage` for the upstream GitHub tag
# `v${version}`.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build system
  poetry-core,

  # dependencies
  accelerate,
  beautifulsoup4,
  certifi,
  docling-core,
  docling-ibm-models,
  docling-parse,
  easyocr,
  filetype,
  huggingface-hub,
  lxml,
  marko,
  # ocrmac # not yet packaged
  onnxruntime,
  openpyxl,
  pandas,
  pillow,
  pluggy,
  pydantic,
  pydantic-settings,
  pylatexenc,
  pypdfium2,
  python-docx,
  python-pptx,
  rapidocr-onnxruntime,
  requests,
  rtree,
  scipy,
  tesserocr,
  tqdm,
  transformers,
  typer,

  # optional dependencies
  # mkdocs-click # not yet packaged
  mkdocs-jupyter,
  mkdocs-material,
  mkdocstrings,

  # tests
  pytestCheckHook,
  writableTmpDirAsHomeHook,
}:

buildPythonPackage rec {
  pname = "docling";
  version = "2.31.2";
  pyproject = true;

  # Sources are fetched from the upstream GitHub repository at the release tag.
  src = fetchFromGitHub {
    owner = "docling-project";
    repo = "docling";
    tag = "v${version}";
    hash = "sha256-a2PZORT4Umf6AI3yEDDcUD0tm22Ahzm7Dwij/5ZUjNs=";
  };

  build-system = [
    poetry-core
  ];

  dependencies = [
    accelerate
    beautifulsoup4
    certifi
    docling-core
    docling-ibm-models
    docling-parse
    easyocr
    filetype
    huggingface-hub
    lxml
    marko
    # ocrmac # not yet packaged
    onnxruntime
    openpyxl
    pandas
    pillow
    pluggy
    pydantic
    pydantic-settings
    pylatexenc
    pypdfium2
    python-docx
    python-pptx
    rapidocr-onnxruntime
    requests
    rtree
    scipy
    tesserocr
    tqdm
    transformers
    typer
  ];

  # Version constraints declared upstream for these packages are relaxed so the
  # versions supplied by the package set are accepted.
  pythonRelaxDeps = [
    "pillow"
    "typer"
  ];

  # Extras mirrored from upstream; entries that are commented out have no
  # package available yet and are therefore left empty or omitted.
  optional-dependencies = {
    ocrmac = [
      # ocrmac # not yet packaged
    ];
    rapidocr = [
      onnxruntime
      rapidocr-onnxruntime
    ];
    tesserocr = [
      tesserocr
    ];

    docs = [
      # mkdocs-click # not yet packaged
      mkdocs-jupyter
      mkdocs-material
      mkdocstrings
      # griffe-pydantic
    ];
  };

  nativeCheckInputs = [
    pytestCheckHook
    writableTmpDirAsHomeHook
  ];

  pythonImportsCheck = [
    "docling"
  ];

  # Tests skipped during the check phase; each group is preceded by the
  # failure observed or the reason it cannot run in the build environment.
  disabledTests = [
    "test_e2e_pdfs_conversions" # AssertionError: ## TableFormer: Table Structure Understanding with Transf
    "test_e2e_conversions" # RuntimeError: Tesseract is not available

    # AssertionError
    # assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"
    "test_ordered_lists"

    # AssertionError: export to md
    "test_e2e_html_conversions"

    # AssertionError: assert 'Unordered li...d code block:' == 'Unordered li...d code block:'
    "test_convert_valid"

    # AssertionError: Markdown file mismatch against groundtruth pftaps057006474.md
    "test_patent_groundtruth"

    # huggingface_hub.errors.LocalEntryNotFoundError: An error happened
    "test_cli_convert"
    "test_code_and_formula_conversion"
    "test_picture_classifier"
    "test_convert_path"
    "test_convert_stream"
    "test_compare_legacy_output"
    "test_ocr_coverage_threshold"

    # requires network access
    "test_page_range"
    "test_parser_backends"

    # AssertionError: pred_itxt==true_itxt
    "test_e2e_valid_csv_conversions"
  ];

  meta = {
    description = "Get your documents ready for gen AI";
    # Point at the same GitHub organisation that `src` is fetched from.
    homepage = "https://github.com/docling-project/docling";
    changelog = "https://github.com/docling-project/docling/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ happysalada ];
    mainProgram = "docling";
  };
}