# Package expression for the `unstructured` Python library.
#
# The function takes the Python packages it depends on as arguments and
# returns a `buildPythonPackage` derivation.  Optional dependency groups are
# not installed by default; they are exposed through
# `passthru.optional-dependencies` so that consumers can pick them up.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  # propagated build inputs
  chardet,
  filetype,
  lxml,
  msg-parser,
  nltk,
  openpyxl,
  pandas,
  pdf2image,
  pdfminer-six,
  pillow,
  pypandoc,
  python-docx,
  python-pptx,
  python-magic,
  markdown,
  requests,
  tabulate,
  xlrd,
  # optional-dependencies
  langdetect,
  sacremoses,
  sentencepiece,
  torch,
  transformers,
  unstructured-inference,
  s3fs,
  fsspec,
  adlfs,
  # , discord-py
  pygithub,
  python-gitlab,
  praw,
  slack-sdk,
  wikipedia,
  google-api-python-client,
  # , gcsfs
  elasticsearch8,
  jq,
  # , dropboxdrivefs
  atlassian-python-api,
  # test dependencies
  pytestCheckHook,
  black,
  coverage,
  click,
  freezegun,
  # , label-studio-sdk
  mypy,
  pytest-cov,
  pytest-mock,
  vcrpy,
  grpcio,
}:
let
  version = "0.13.7";

  # Shared by the correctly spelled `huggingface` group and its legacy alias
  # below.  A plain `let` binding is used instead of `rec` on the attribute
  # set, because `rec` would turn `wikipedia = [ wikipedia ]` into a
  # self-reference.
  huggingfaceDeps = [
    langdetect
    sacremoses
    sentencepiece
    torch
    transformers
  ];

  # Optional dependency groups, keyed by group name.
  # Groups whose packages are commented out in the argument list above are
  # kept as empty lists, with the missing package names noted alongside.
  optional-dependencies = {
    huggingface = huggingfaceDeps;
    # Misspelled key from earlier revisions of this file; kept as an alias so
    # that existing references to it keep evaluating.
    huggingflace = huggingfaceDeps;
    local-inference = [ unstructured-inference ];
    s3 = [
      s3fs
      fsspec
    ];
    azure = [
      adlfs
      fsspec
    ];
    discord = [ ]; # discord-py
    github = [ pygithub ];
    gitlab = [ python-gitlab ];
    reddit = [ praw ];
    slack = [ slack-sdk ];
    wikipedia = [ wikipedia ];
    google-drive = [ google-api-python-client ];
    gcs = [ ]; # gcsfs fsspec
    elasticsearch = [
      elasticsearch8
      jq
    ];
    dropbox = [ ]; # dropboxdrivefs fsspec
    confluence = [ atlassian-python-api ];
  };
in
buildPythonPackage {
  pname = "unstructured";
  inherit version;
  format = "setuptools";

  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured";
    rev = "refs/tags/${version}";
    hash = "sha256-Ekfa454mL7isMX79bd/YXPPHnetSzo1Mlg/XJakYyDM=";
  };

  propagatedBuildInputs = [
    chardet
    filetype
    lxml
    msg-parser
    nltk
    openpyxl
    pandas
    pdf2image
    pdfminer-six
    pillow
    pypandoc
    python-docx
    python-pptx
    python-magic
    markdown
    requests
    tabulate
    xlrd
  ];

  pythonImportsCheck = [ "unstructured" ];

  # The tests try to download punkt from nltk.
  # TODO: figure out how to make it available so the tests can be enabled.
  doCheck = false;

  # Listed for when the test suite is enabled; unused while doCheck is false.
  nativeCheckInputs = [
    pytestCheckHook
    black
    coverage
    click
    freezegun
    mypy
    pytest-cov
    pytest-mock
    vcrpy
    grpcio
  ];

  passthru.optional-dependencies = optional-dependencies;

  meta = {
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    mainProgram = "unstructured-ingest";
    homepage = "https://github.com/Unstructured-IO/unstructured";
    changelog = "https://github.com/Unstructured-IO/unstructured/blob/${version}/CHANGELOG.md";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ happysalada ];
  };
}