# Nix package expression for the `docling` Python package.
#
# The function takes its inputs (helpers, build backend, runtime/optional
# dependencies and test hooks) as arguments and returns the result of
# `buildPythonPackage`, built from the tagged GitHub source below.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build system
  poetry-core,

  # dependencies
  accelerate,
  beautifulsoup4,
  certifi,
  docling-core,
  docling-ibm-models,
  docling-parse,
  easyocr,
  filetype,
  huggingface-hub,
  lxml,
  marko,
  # ocrmac # not yet packaged
  onnxruntime,
  openpyxl,
  pandas,
  pillow,
  pluggy,
  pydantic,
  pydantic-settings,
  pylatexenc,
  pypdfium2,
  python-docx,
  python-pptx,
  rapidocr-onnxruntime,
  requests,
  rtree,
  scipy,
  tesserocr,
  tqdm,
  transformers,
  typer,

  # optional dependencies
  # mkdocs-click # not yet packaged
  mkdocs-jupyter,
  mkdocs-material,
  mkdocstrings,

  # tests
  pytestCheckHook,
  writableTmpDirAsHomeHook,
}:

buildPythonPackage rec {
  pname = "docling";
  version = "2.31.2";
  pyproject = true;

  # Source is fetched from the git tag `v<version>`; `hash` pins its contents
  # and must be updated together with `version`.
  src = fetchFromGitHub {
    owner = "docling-project";
    repo = "docling";
    tag = "v${version}";
    hash = "sha256-a2PZORT4Umf6AI3yEDDcUD0tm22Ahzm7Dwij/5ZUjNs=";
  };

  build-system = [
    poetry-core
  ];

  dependencies = [
    accelerate
    beautifulsoup4
    certifi
    docling-core
    docling-ibm-models
    docling-parse
    easyocr
    filetype
    huggingface-hub
    lxml
    marko
    # ocrmac # not yet packaged
    onnxruntime
    openpyxl
    pandas
    pillow
    pluggy
    pydantic
    pydantic-settings
    pylatexenc
    pypdfium2
    python-docx
    python-pptx
    rapidocr-onnxruntime
    requests
    rtree
    scipy
    tesserocr
    tqdm
    transformers
    typer
  ];

  # Loosen the version constraints declared for these packages so the
  # versions supplied by the package set are accepted.
  pythonRelaxDeps = [
    "pillow"
    "typer"
  ];

  # Extras, keyed by extra name. Entries that are commented out are listed
  # for reference only and contribute nothing to the extra.
  optional-dependencies = {
    ocrmac = [
      # ocrmac # not yet packaged
    ];
    rapidocr = [
      onnxruntime
      rapidocr-onnxruntime
    ];
    tesserocr = [
      tesserocr
    ];

    docs = [
      # mkdocs-click # not yet packaged
      mkdocs-jupyter
      mkdocs-material
      mkdocstrings
      # griffe-pydantic
    ];
  };

  # pytest runs the test suite; the second hook presumably points HOME at a
  # writable temporary directory for the tests (per its name).
  nativeCheckInputs = [
    pytestCheckHook
    writableTmpDirAsHomeHook
  ];

  pythonImportsCheck = [
    "docling"
  ];

  # Tests skipped during the check phase, grouped by the failure observed.
  disabledTests = [
    "test_e2e_pdfs_conversions" # AssertionError: ## TableFormer: Table Structure Understanding with Transf
    "test_e2e_conversions" # RuntimeError: Tesseract is not available

    # AssertionError
    # assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"
    "test_ordered_lists"

    # AssertionError: export to md
    "test_e2e_html_conversions"

    # AssertionError: assert 'Unordered li...d code block:' == 'Unordered li...d code block:'
    "test_convert_valid"

    # AssertionError: Markdown file mismatch against groundtruth pftaps057006474.md
    "test_patent_groundtruth"

    # huggingface_hub.errors.LocalEntryNotFoundError: An error happened
    "test_cli_convert"
    "test_code_and_formula_conversion"
    "test_picture_classifier"
    "test_convert_path"
    "test_convert_stream"
    "test_compare_legacy_output"
    "test_ocr_coverage_threshold"

    # requires network access
    "test_page_range"
    "test_parser_backends"

    # AssertionError: pred_itxt==true_itxt
    "test_e2e_valid_csv_conversions"
  ];

  meta = {
    description = "Get your documents ready for gen AI";
    # Keep these URLs in sync with the owner/repo used by `src` above.
    homepage = "https://github.com/docling-project/docling";
    changelog = "https://github.com/docling-project/docling/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ happysalada ];
    mainProgram = "docling";
  };
}