1{
2 lib,
3 stdenvNoCC,
4 fetchFromGitHub,
5 python3,
6 makeWrapper,
7 nix-update-script,
8}:
9let
# Python interpreter bundled with every library the API server is run against.
# The wrapper created in installPhase launches `uvicorn` from this environment.
pythonEnv = python3.withPackages (
  ps:
  let
    # Explicitly listed packages. Entries that are commented out are kept
    # from the original list; presumably they are not available in the
    # Python package set — TODO confirm before re-enabling.
    explicitPackages = with ps; [
      aiofiles
      annotated-types
      antlr4-python3-runtime
      anyio
      backoff
      beautifulsoup4
      cachetools
      certifi
      cffi
      chardet
      charset-normalizer
      click
      coloredlogs
      contourpy
      cryptography
      cycler
      dataclasses-json
      deprecated
      effdet
      emoji
      et-xmlfile
      eval-type-backport
      fastapi
      filelock
      filetype
      flatbuffers
      fonttools
      fsspec
      google-api-core
      google-auth
      google-cloud-vision
      googleapis-common-protos
      grpcio
      grpcio-status
      h11
      html5lib
      httpcore
      httpx
      huggingface-hub
      humanfriendly
      idna
      iopath
      jinja2
      joblib
      jsonpath
      kiwisolver
      langdetect
      layoutparser
      lxml
      markdown
      markupsafe
      marshmallow
      matplotlib
      mpmath
      mypy-extensions
      nest-asyncio
      networkx
      nltk
      numpy
      olefile
      omegaconf
      onnx
      onnxruntime
      opencv-python
      openpyxl
      packaging
      pandas
      pdf2image
      pdfminer-six
      pdfplumber
      # pi-heif
      pikepdf
      pillow
      portalocker
      proto-plus
      protobuf
      psutil
      pyasn1
      pyasn1-modules
      pycocotools
      pycparser
      pycryptodome
      pydantic
      pydantic-core
      pypandoc
      pyparsing
      pypdf
      pypdfium2
      python-dateutil
      python-docx
      # python-iso639
      python-magic
      python-multipart
      # python-oxmsg
      python-pptx
      pytz
      pyyaml
      rapidfuzz
      ratelimit
      regex
      requests
      requests-toolbelt
      rsa
      safetensors
      scipy
      six
      sniffio
      soupsieve
      starlette
      sympy
      timm
      tokenizers
      torch
      torchvision
      tqdm
      transformers
      typing-extensions
      typing-inspect
      tzdata
      unstructured
      # unstructured-client
      unstructured-inference
      # unstructured-pytesseract
      urllib3
      uvicorn
      webencodings
      wrapt
      xlrd
      xlsxwriter
    ];
  in
  # Append the optional dependency groups declared by the packages
  # themselves: the `grpc` group of google-api-core and the `all-docs`
  # group of unstructured.
  explicitPackages
  ++ ps.google-api-core.optional-dependencies.grpc
  ++ ps.unstructured.optional-dependencies.all-docs
);
# Upstream release version. Reused below as the derivation version, as the
# git revision fetched from GitHub, and in the changelog URL.
version = "0.0.89";
# NLTK data sets selected through nltk's `dataDir` helper. The result is
# exported to the wrapped program via the NLTK_DATA environment variable
# (see installPhase).
unstructured_api_nltk_data = python3.pkgs.nltk.dataDir (
  available: with available; [
    punkt
    averaged-perceptron-tagger
  ]
);
152in
# Packages the unstructured-api sources as a runnable service: the repository
# is copied verbatim into $out/lib and a `unstructured-api` launcher is
# generated that runs uvicorn from `pythonEnv` against the bundled app.
stdenvNoCC.mkDerivation {
  pname = "unstructured-api";
  inherit version;

  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured-api";
    rev = version;
    hash = "sha256-FxWOR13wZwowZny2t4Frwl+cLMv+6nkHxQm9Xc4Y9Kw=";
  };

  # makeWrapper provides the `makeWrapper` shell function used in installPhase.
  nativeBuildInputs = [ makeWrapper ];

  # Install steps:
  #   1. copy the whole source tree into $out/lib;
  #   2. create $out/bin/unstructured-api as a wrapper around uvicorn that
  #      - points NLTK_DATA at the pre-built NLTK data directory,
  #      - prepends $out/lib to PYTHONPATH so `prepline_general` is importable,
  #      - passes the ASGI application path as the first argument.
  # All "$out" expansions are quoted; `mkdir -p` on the sub-directories
  # already creates "$out" itself.
  installPhase = ''
    runHook preInstall

    mkdir -p "$out/bin" "$out/lib"
    cp -r . "$out/lib"

    makeWrapper ${pythonEnv}/bin/uvicorn "$out/bin/unstructured-api" \
      --set NLTK_DATA ${unstructured_api_nltk_data} \
      --prefix PYTHONPATH : "$out/lib" \
      --add-flags "prepline_general.api.app:app"

    runHook postInstall
  '';

  passthru = {
    updateScript = nix-update-script { };
  };

  meta = {
    description = "Open-source toolkit designed to make it easy to prepare unstructured data like PDFs, HTML and Word Documents for downstream data science tasks";
    homepage = "https://github.com/Unstructured-IO/unstructured-api";
    changelog = "https://github.com/Unstructured-IO/unstructured-api/releases/tag/${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ happysalada ];
    # The only executable installed is $out/bin/unstructured-api; declaring it
    # lets `nix run` and `lib.getExe` resolve the binary explicitly.
    mainProgram = "unstructured-api";
  };
}