# Package expression for the `docling` Python package, built from the
# upstream GitHub sources with poetry-core as the build backend.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build system
  poetry-core,

  # dependencies
  accelerate,
  beautifulsoup4,
  certifi,
  docling-core,
  docling-ibm-models,
  docling-parse,
  easyocr,
  filetype,
  huggingface-hub,
  lxml,
  marko,
  # ocrmac # not yet packaged
  onnxruntime,
  openpyxl,
  pandas,
  pillow,
  pluggy,
  pydantic,
  pydantic-settings,
  pylatexenc,
  pypdfium2,
  python-docx,
  python-pptx,
  rapidocr-onnxruntime,
  requests,
  rtree,
  scipy,
  tesserocr,
  tqdm,
  transformers,
  typer,

  # optional dependencies
  # mkdocs-click # not yet packaged
  mkdocs-jupyter,
  mkdocs-material,
  mkdocstrings,

  # tests
  pytestCheckHook,
  writableTmpDirAsHomeHook,
}:

buildPythonPackage rec {
  pname = "docling";
  version = "2.42.0";
  pyproject = true;

  # The release tag is derived from `version`, so bumping the version above
  # (together with the hash) is enough to move to a new upstream release.
  src = fetchFromGitHub {
    owner = "docling-project";
    repo = "docling";
    tag = "v${version}";
    hash = "sha256-9HUomW55Yg5N7u3Wb4imzRUYECeGkb3lkHPLEGzuAnA=";
  };

  build-system = [
    poetry-core
  ];

  dependencies = [
    accelerate
    beautifulsoup4
    certifi
    docling-core
    docling-ibm-models
    docling-parse
    easyocr
    filetype
    huggingface-hub
    lxml
    marko
    # ocrmac # not yet packaged
    onnxruntime
    openpyxl
    pandas
    pillow
    pluggy
    pydantic
    pydantic-settings
    pylatexenc
    pypdfium2
    python-docx
    python-pptx
    rapidocr-onnxruntime
    requests
    rtree
    scipy
    tesserocr
    tqdm
    transformers
    typer
  ];

  # Drop the upstream version constraint on pillow so the packaged version
  # is accepted by the runtime dependency check.
  pythonRelaxDeps = [
    "pillow"
  ];

  optional-dependencies = {
    # Intentionally empty until ocrmac is available.
    ocrmac = [
      # ocrmac # not yet packaged
    ];
    rapidocr = [
      onnxruntime
      rapidocr-onnxruntime
    ];
    tesserocr = [
      tesserocr
    ];

    docs = [
      # mkdocs-click # not yet packaged
      mkdocs-jupyter
      mkdocs-material
      mkdocstrings
      # griffe-pydantic
    ];
  };

  nativeCheckInputs = [
    pytestCheckHook
    writableTmpDirAsHomeHook
  ];

  pythonImportsCheck = [
    "docling"
  ];

  # Each group below is preceded by the failure observed (or the reason)
  # that led to the tests being skipped.
  disabledTests = [
    "test_e2e_pdfs_conversions" # AssertionError: ## TableFormer: Table Structure Understanding with Transf
    "test_e2e_conversions" # RuntimeError: Tesseract is not available

    # AssertionError
    # assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"
    "test_ordered_lists"

    # AssertionError: export to md
    "test_e2e_html_conversions"

    # AssertionError: assert 'Unordered li...d code block:' == 'Unordered li...d code block:'
    "test_convert_valid"

    # AssertionError: Markdown file mismatch against groundtruth pftaps057006474.md
    "test_patent_groundtruth"

    # huggingface_hub.errors.LocalEntryNotFoundError: An error happened
    "test_cli_convert"
    "test_code_and_formula_conversion"
    "test_picture_classifier"
    "test_convert_path"
    "test_convert_stream"
    "test_compare_legacy_output"
    "test_ocr_coverage_threshold"
    "test_formula_conversion_with_page_range"

    # requires network access
    "test_page_range"
    "test_parser_backends"
    "test_confidence"
    "test_e2e_webp_conversions"
    "test_asr_pipeline_conversion"

    # AssertionError: pred_itxt==true_itxt
    "test_e2e_valid_csv_conversions"
  ];

  meta = {
    description = "Get your documents ready for gen AI";
    # Keep these URLs in sync with the owner/repo used by `src` above.
    homepage = "https://github.com/docling-project/docling";
    changelog = "https://github.com/docling-project/docling/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ happysalada ];
    mainProgram = "docling";
  };
}