# Arguments of the package function, grouped by where the expression below
# uses them. Entries that are commented out are not accepted as arguments.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # propagated build inputs
  chardet,
  filetype,
  lxml,
  msg-parser,
  nltk,
  openpyxl,
  pandas,
  pdf2image,
  pdfminer-six,
  pillow,
  pypandoc,
  python-docx,
  python-pptx,
  python-magic,
  markdown,
  requests,
  tabulate,
  xlrd,

  # optional-dependencies
  langdetect,
  sacremoses,
  sentencepiece,
  torch,
  transformers,
  unstructured-inference,
  s3fs,
  fsspec,
  adlfs,
  # discord-py,
  pygithub,
  python-gitlab,
  praw,
  slack-sdk,
  wikipedia,
  google-api-python-client,
  # gcsfs,
  elasticsearch8,
  jq,
  # dropboxdrivefs,
  atlassian-python-api,

  # test dependencies
  pytestCheckHook,
  black,
  coverage,
  click,
  freezegun,
  # label-studio-sdk,
  mypy,
  pytest-cov,
  pytest-mock,
  vcrpy,
  grpcio,
}:
let
  # Release of unstructured being packaged; reused for the source tag and
  # the changelog URL further down.
  version = "0.10.30";

  # Packages backing the "huggingface" extra. Bound once so that the correctly
  # spelled key and the legacy misspelled key below share the same list.
  huggingfaceDeps = [
    langdetect
    sacremoses
    sentencepiece
    torch
    transformers
  ];

  # Extras exposed through passthru.optional-dependencies, keyed by extra name.
  # Lists are left empty where the dependencies named in the trailing comment
  # are not passed to this function.
  optional-dependencies = {
    huggingface = huggingfaceDeps;
    # Deprecated alias: this key used to be misspelled. It is kept so that
    # existing references to `optional-dependencies.huggingflace` still evaluate.
    huggingflace = huggingfaceDeps;
    local-inference = [ unstructured-inference ];
    s3 = [ s3fs fsspec ];
    azure = [ adlfs fsspec ];
    discord = [ ]; # discord-py
    github = [ pygithub ];
    gitlab = [ python-gitlab ];
    reddit = [ praw ];
    slack = [ slack-sdk ];
    wikipedia = [ wikipedia ];
    google-drive = [ google-api-python-client ];
    gcs = [ ]; # gcsfs fsspec
    elasticsearch = [ elasticsearch8 jq ];
    dropbox = [ ]; # dropboxdrivefs fsspec
    confluence = [ atlassian-python-api ];
  };
in
# Derivation for the `unstructured` Python package, built with setuptools from
# the GitHub tag that matches `version`.
buildPythonPackage {
  pname = "unstructured";
  inherit version;
  format = "setuptools";

  # Source is fetched by tag name and pinned by hash.
  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured";
    rev = "refs/tags/${version}";
    hash = "sha256-RaVg4XNmh1S5G1CHQiME7t/BmK0MI9M8wI2YTKjpqzM=";
  };

  # Dependencies propagated to consumers of this package.
  propagatedBuildInputs = [
    chardet
    filetype
    lxml
    msg-parser
    nltk
    openpyxl
    pandas
    pdf2image
    pdfminer-six
    pillow
    pypandoc
    python-docx
    python-pptx
    python-magic
    markdown
    requests
    tabulate
    xlrd
  ];

  pythonImportsCheck = [ "unstructured" ];

  # The tests try to download the punkt data from nltk.
  # TODO: figure out how to make that data available, then enable the tests.
  doCheck = false;

  # Inputs for the test phase; listed even though checks are disabled above.
  nativeCheckInputs = [
    pytestCheckHook
    black
    coverage
    click
    freezegun
    mypy
    pytest-cov
    pytest-mock
    vcrpy
    grpcio
  ];

  # Expose the extras table on the resulting package.
  passthru = {
    inherit optional-dependencies;
  };

  meta = {
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    homepage = "https://github.com/Unstructured-IO/unstructured";
    changelog = "https://github.com/Unstructured-IO/unstructured/blob/${version}/CHANGELOG.md";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.happysalada ];
  };
}