Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
at gcc-offload 280 lines 4.6 kB view raw
# Package expression for the `unstructured` Python library.
#
# The source is fetched from the upstream GitHub tag matching `version` and
# built with setuptools. Base runtime requirements go into
# `propagatedBuildInputs`; per-document-format extras are exposed through
# `optional-dependencies`. The test suite is currently disabled (see `doCheck`).
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # core networking and async dependencies
  anyio,
  backoff,
  certifi,
  httpcore,
  httpx,
  h11,
  nest-asyncio,
  requests,
  requests-toolbelt,
  sniffio,
  urllib3,

  # core parsing and processing
  beautifulsoup4,
  chardet,
  charset-normalizer,
  emoji,
  filetype,
  html5lib,
  idna,
  joblib,
  # jsonpath-python,
  nltk,
  olefile,
  orderly-set,
  python-dateutil,
  # python-iso639,
  python-magic,
  # python-oxmsg,
  rapidfuzz,
  regex,
  soupsieve,
  webencodings,

  # core data handling
  dataclasses-json,
  deepdiff,
  marshmallow,
  mypy-extensions,
  packaging,
  typing-extensions,
  typing-inspect,

  # core system utilities
  cffi,
  cryptography,
  psutil,
  pycparser,
  six,
  tqdm,
  wrapt,

  # document format support
  markdown,
  pdfminer-six,
  pdfplumber,
  # pi-heif,
  pikepdf,
  pypandoc,
  pypdf,
  python-docx,
  # unstructured-client,
  # unstructured-pytesseract,

  # optional dependencies
  # csv
  pytz,
  tzdata,
  # markdown
  importlib-metadata,
  zipp,
  # pdf
  opencv-python,
  # NOTE(review): paddlepaddle is accepted here but only referenced from a
  # commented-out entry in the `paddleocr` extra below.
  paddlepaddle,
  pdf2image,
  # unstructured-paddleocr,
  # pptx
  lxml,
  pillow,
  python-pptx,
  xlsxwriter,
  # xlsx
  et-xmlfile,
  networkx,
  numpy,
  openpyxl,
  pandas,
  xlrd,
  # huggingface
  langdetect,
  sacremoses,
  sentencepiece,
  torch,
  transformers,
  # local-inference
  unstructured-inference,

  # test dependencies
  pytestCheckHook,
  black,
  coverage,
  click,
  freezegun,
  # label-studio-sdk,
  mypy,
  pytest-cov-stub,
  pytest-mock,
  vcrpy,
  grpcio,
}:
let
  # Single source of truth for the release; reused by `src` and `meta.changelog`.
  version = "0.16.11";
in
buildPythonPackage {
  pname = "unstructured";
  inherit version;
  format = "setuptools";

  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured";
    rev = "refs/tags/${version}";
    hash = "sha256-+I5eXG/ICmYPDTavDnyLlopIvoABjdDwOyfotrNs6qs=";
  };

  propagatedBuildInputs = [
    # Base dependencies
    anyio
    backoff
    beautifulsoup4
    certifi
    cffi
    chardet
    charset-normalizer
    click
    cryptography
    dataclasses-json
    deepdiff
    emoji
    filetype
    h11
    html5lib
    httpcore
    httpx
    idna
    joblib
    # jsonpath-python
    langdetect
    lxml
    marshmallow
    mypy-extensions
    nest-asyncio
    nltk
    numpy
    olefile
    orderly-set
    packaging
    psutil
    pycparser
    pypdf
    python-dateutil
    # python-iso639
    python-magic
    # python-oxmsg
    rapidfuzz
    regex
    requests
    requests-toolbelt
    six
    sniffio
    soupsieve
    tqdm
    typing-extensions
    typing-inspect
    # unstructured-client
    urllib3
    webencodings
    wrapt
  ];

  # `rec` lets `all-docs` be assembled from the sibling extras defined below.
  # Note that `paddleocr` and `huggingface` are not part of `all-docs`.
  optional-dependencies = rec {
    all-docs = csv ++ docx ++ epub ++ pdf ++ req-markdown ++ odt ++ org ++ pptx ++ xlsx;
    csv = [
      numpy
      pandas
      python-dateutil
      pytz
      tzdata
    ];
    docx = [
      lxml
      python-docx
      typing-extensions
    ];
    epub = [ pypandoc ];
    req-markdown = [
      importlib-metadata
      markdown
      zipp
    ];
    odt = [
      lxml
      pypandoc
      python-docx
      typing-extensions
    ];
    org = [
      pypandoc
    ];
    paddleocr = [
      opencv-python
      # paddlepaddle # 3.12 not supported for now
      pdf2image
      # unstructured-paddleocr
    ];
    pdf = [
      pdf2image
      pdfminer-six
      pdfplumber
      # pi-heif
      pikepdf
      pypdf
      unstructured-inference
      # unstructured-pytesseract
    ];
    pptx = [
      lxml
      pillow
      python-pptx
      xlsxwriter
    ];
    xlsx = [
      et-xmlfile
      networkx
      numpy
      openpyxl
      pandas
      xlrd
    ];
    huggingface = [
      langdetect
      sacremoses
      sentencepiece
      torch
      transformers
    ];
  };

  pythonImportsCheck = [ "unstructured" ];

  # The tests try to download punkt from nltk;
  # figure out how to make it available in order to enable them.
  doCheck = false;

  # Kept ready for when the test suite is enabled (doCheck is currently false).
  nativeCheckInputs = [
    pytestCheckHook
    black
    coverage
    click
    freezegun
    mypy
    pytest-cov-stub
    pytest-mock
    vcrpy
    grpcio
  ];

  # Explicit `lib.` references instead of a blanket `with lib;`, so that only
  # the maintainer list is resolved through a (narrow) `with` scope.
  meta = {
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    # NOTE(review): confirm this release still installs an `unstructured-ingest` executable.
    mainProgram = "unstructured-ingest";
    homepage = "https://github.com/Unstructured-IO/unstructured";
    changelog = "https://github.com/Unstructured-IO/unstructured/blob/${version}/CHANGELOG.md";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ happysalada ];
  };
}