# Package expression for the `unstructured` Python library.
#
# Every argument below is supplied by the caller (normally the Python package
# set); the grouping comments only document why each dependency is listed.
# Commented-out names are dependencies that are not wired in here.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # core networking and async dependencies
  anyio,
  backoff,
  certifi,
  httpcore,
  httpx,
  h11,
  nest-asyncio,
  requests,
  requests-toolbelt,
  sniffio,
  urllib3,

  # core parsing and processing
  beautifulsoup4,
  chardet,
  charset-normalizer,
  emoji,
  filetype,
  html5lib,
  idna,
  joblib,
  # jsonpath-python,
  nltk,
  olefile,
  orderly-set,
  python-dateutil,
  # python-iso639,
  python-magic,
  # python-oxmsg,
  rapidfuzz,
  regex,
  soupsieve,
  webencodings,

  # core data handling
  dataclasses-json,
  deepdiff,
  marshmallow,
  mypy-extensions,
  packaging,
  typing-extensions,
  typing-inspect,

  # core system utilities
  cffi,
  cryptography,
  psutil,
  pycparser,
  six,
  tqdm,
  wrapt,

  # document format support
  markdown,
  pdfminer-six,
  pdfplumber,
  # pi-heif,
  pikepdf,
  pypandoc,
  pypdf,
  python-docx,
  # unstructured-client,
  # unstructured-pytesseract,

  # optional dependencies
  # csv
  pytz,
  tzdata,
  # markdown
  importlib-metadata,
  zipp,
  # pdf
  opencv-python,
  # paddlepaddle is accepted but currently unused: it is commented out in the
  # `paddleocr` extra below ("3.12 not supported for now").
  paddlepaddle,
  pdf2image,
  # unstructured-paddleocr,
  # pptx
  lxml,
  pillow,
  python-pptx,
  xlsxwriter,
  # xlsx
  et-xmlfile,
  networkx,
  numpy,
  openpyxl,
  pandas,
  xlrd,
  # huggingface
  langdetect,
  sacremoses,
  sentencepiece,
  torch,
  transformers,
  # local-inference
  unstructured-inference,

  # test dependencies
  pytestCheckHook,
  black,
  coverage,
  click,
  freezegun,
  # label-studio-sdk,
  mypy,
  pytest-cov-stub,
  pytest-mock,
  vcrpy,
  grpcio,
}:
let
  # Single source of truth for the release: reused for the derivation version,
  # the fetched git tag and the changelog URL.
  version = "0.16.11";
in
buildPythonPackage {
  pname = "unstructured";
  inherit version;
  format = "setuptools";

  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured";
    rev = "refs/tags/${version}";
    hash = "sha256-+I5eXG/ICmYPDTavDnyLlopIvoABjdDwOyfotrNs6qs=";
  };

  propagatedBuildInputs = [
    # Base dependencies
    anyio
    backoff
    beautifulsoup4
    certifi
    cffi
    chardet
    charset-normalizer
    click
    cryptography
    dataclasses-json
    deepdiff
    emoji
    filetype
    h11
    html5lib
    httpcore
    httpx
    idna
    joblib
    # jsonpath-python
    langdetect
    lxml
    marshmallow
    mypy-extensions
    nest-asyncio
    nltk
    numpy
    olefile
    orderly-set
    packaging
    psutil
    pycparser
    pypdf
    python-dateutil
    # python-iso639
    python-magic
    # python-oxmsg
    rapidfuzz
    regex
    requests
    requests-toolbelt
    six
    sniffio
    soupsieve
    tqdm
    typing-extensions
    typing-inspect
    # unstructured-client
    urllib3
    webencodings
    wrapt
  ];

  # Extras, keyed by document type / feature.
  #
  # The set is `rec` so that `all-docs` can be assembled from its siblings.
  # Inside a `rec` set an attribute name shadows any outer binding of the same
  # name, which is why the markdown extra is called `req-markdown`: naming it
  # `markdown` would make the `markdown` package reference in its own list
  # resolve to the extra itself.
  optional-dependencies = rec {
    all-docs = csv ++ docx ++ epub ++ pdf ++ req-markdown ++ odt ++ org ++ pptx ++ xlsx;
    csv = [
      numpy
      pandas
      python-dateutil
      pytz
      tzdata
    ];
    docx = [
      lxml
      python-docx
      typing-extensions
    ];
    epub = [ pypandoc ];
    req-markdown = [
      importlib-metadata
      markdown
      zipp
    ];
    odt = [
      lxml
      pypandoc
      python-docx
      typing-extensions
    ];
    org = [
      pypandoc
    ];
    paddleocr = [
      opencv-python
      # paddlepaddle # 3.12 not supported for now
      pdf2image
      # unstructured-paddleocr
    ];
    pdf = [
      pdf2image
      pdfminer-six
      pdfplumber
      # pi-heif
      pikepdf
      pypdf
      unstructured-inference
      # unstructured-pytesseract
    ];
    pptx = [
      lxml
      pillow
      python-pptx
      xlsxwriter
    ];
    xlsx = [
      et-xmlfile
      networkx
      numpy
      openpyxl
      pandas
      xlrd
    ];
    huggingface = [
      langdetect
      sacremoses
      sentencepiece
      torch
      transformers
    ];
  };

  pythonImportsCheck = [ "unstructured" ];

  # The tests try to download the punkt data from nltk.
  # TODO: figure out how to make it available so the tests can be enabled.
  doCheck = false;

  # Kept alongside `doCheck = false` so the test inputs are already listed
  # once the tests can be turned on.
  nativeCheckInputs = [
    pytestCheckHook
    black
    coverage
    click
    freezegun
    mypy
    pytest-cov-stub
    pytest-mock
    vcrpy
    grpcio
  ];

  # `lib` is referenced explicitly instead of wrapping the whole attrset in
  # `with lib;`, so no unrelated `lib` names leak into scope here.
  meta = {
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    # NOTE(review): confirm this release still installs an
    # `unstructured-ingest` executable.
    mainProgram = "unstructured-ingest";
    homepage = "https://github.com/Unstructured-IO/unstructured";
    changelog = "https://github.com/Unstructured-IO/unstructured/blob/${version}/CHANGELOG.md";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ happysalada ];
  };
}