# Package expression for the `unstructured` Python library
# (https://github.com/Unstructured-IO/unstructured).
#
# The function takes the Python package set members it needs as arguments and
# returns a `buildPythonPackage` derivation. Optional extras are exposed via
# `passthru.optional-dependencies` and are not part of the default closure.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # propagated build inputs
  chardet,
  filetype,
  lxml,
  msg-parser,
  nltk,
  openpyxl,
  pandas,
  pdf2image,
  pdfminer-six,
  pillow,
  pypandoc,
  python-docx,
  python-pptx,
  python-magic,
  markdown,
  requests,
  tabulate,
  xlrd,

  # optional-dependencies
  langdetect,
  sacremoses,
  sentencepiece,
  torch,
  transformers,
  unstructured-inference,
  s3fs,
  fsspec,
  adlfs,
  # discord-py is not taken as an argument; the `discord` extra stays empty
  pygithub,
  python-gitlab,
  praw,
  slack-sdk,
  wikipedia,
  google-api-python-client,
  # gcsfs is not taken as an argument; the `gcs` extra stays empty
  elasticsearch8,
  jq,
  # dropboxdrivefs is not taken as an argument; the `dropbox` extra stays empty
  atlassian-python-api,

  # test dependencies
  pytestCheckHook,
  black,
  coverage,
  click,
  freezegun,
  # label-studio-sdk is not taken as an argument
  mypy,
  pytest-cov,
  pytest-mock,
  vcrpy,
  grpcio,
}:
let
  version = "0.13.7";

  # Packages backing the `huggingface` extra. Bound once so that the correctly
  # spelled key and the legacy alias below can never drift apart.
  huggingfaceDeps = [
    langdetect
    sacremoses
    sentencepiece
    torch
    transformers
  ];

  # Extras, keyed by extra name. An empty list means the packages named in the
  # trailing comment are not wired into this expression.
  optional-dependencies = {
    huggingface = huggingfaceDeps;
    # Historical misspelling of `huggingface`; kept as an alias so existing
    # references to `optional-dependencies.huggingflace` keep evaluating.
    huggingflace = huggingfaceDeps;
    local-inference = [ unstructured-inference ];
    s3 = [
      s3fs
      fsspec
    ];
    azure = [
      adlfs
      fsspec
    ];
    discord = [ ]; # discord-py
    github = [ pygithub ];
    gitlab = [ python-gitlab ];
    reddit = [ praw ];
    slack = [ slack-sdk ];
    # Not a `rec` set: the right-hand side is the `wikipedia` function argument.
    wikipedia = [ wikipedia ];
    google-drive = [ google-api-python-client ];
    gcs = [ ]; # gcsfs fsspec
    elasticsearch = [
      elasticsearch8
      jq
    ];
    dropbox = [ ]; # dropboxdrivefs fsspec
    confluence = [ atlassian-python-api ];
  };
in
buildPythonPackage {
  pname = "unstructured";
  inherit version;
  format = "setuptools";

  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured";
    rev = "refs/tags/${version}";
    hash = "sha256-Ekfa454mL7isMX79bd/YXPPHnetSzo1Mlg/XJakYyDM=";
  };

  propagatedBuildInputs = [
    chardet
    filetype
    lxml
    msg-parser
    nltk
    openpyxl
    pandas
    pdf2image
    pdfminer-six
    pillow
    pypandoc
    python-docx
    python-pptx
    python-magic
    markdown
    requests
    tabulate
    xlrd
  ];

  pythonImportsCheck = [ "unstructured" ];

  # The tests try to download the punkt data from nltk.
  # TODO: figure out how to make it available so the tests can be enabled.
  doCheck = false;

  # Listed so the check phase is ready to use once `doCheck` is turned on.
  nativeCheckInputs = [
    pytestCheckHook
    black
    coverage
    click
    freezegun
    mypy
    pytest-cov
    pytest-mock
    vcrpy
    grpcio
  ];

  passthru.optional-dependencies = optional-dependencies;

  # Attributes are spelled out through `lib.` instead of `with lib;` so every
  # name in this set has an obvious origin.
  meta = {
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    mainProgram = "unstructured-ingest";
    homepage = "https://github.com/Unstructured-IO/unstructured";
    changelog = "https://github.com/Unstructured-IO/unstructured/blob/${version}/CHANGELOG.md";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.happysalada ];
  };
}