# Package expression for the `unstructured` Python library, built from the
# upstream GitHub tag with setuptools. Optional feature groups are exposed
# through `passthru.optional-dependencies`.
{ lib
, buildPythonPackage
, fetchFromGitHub
# propagated build inputs
, chardet
, filetype
, lxml
, msg-parser
, nltk
, openpyxl
, pandas
, pdf2image
, pdfminer-six
, pillow
, pypandoc
, python-docx
, python-pptx
, python-magic
, markdown
, requests
, tabulate
, xlrd
# optional-dependencies
, langdetect
, sacremoses
, sentencepiece
, torch
, transformers
, unstructured-inference
, s3fs
, fsspec
, adlfs
# , discord-py
, pygithub
, python-gitlab
, praw
, slack-sdk
, wikipedia
, google-api-python-client
# , gcsfs
, elasticsearch8
, jq
# , dropboxdrivefs
, atlassian-python-api
# test dependencies
, pytestCheckHook
, black
, coverage
, click
, freezegun
# , label-studio-sdk
, mypy
, pytest-cov
, pytest-mock
, vcrpy
, grpcio
}:
let
  version = "0.10.30";

  # Inputs of the Hugging Face feature group. Bound once so that the correctly
  # spelled key and the legacy misspelled alias below always stay in sync.
  huggingfaceDeps = [
    langdetect
    sacremoses
    sentencepiece
    torch
    transformers
  ];

  # Optional feature groups. Groups whose inputs are commented out in the
  # argument list above are left empty; the intended inputs are noted inline.
  optional-dependencies = {
    huggingface = huggingfaceDeps;
    # Legacy misspelling of `huggingface`, kept so that existing references to
    # `optional-dependencies.huggingflace` keep evaluating.
    huggingflace = huggingfaceDeps;
    local-inference = [ unstructured-inference ];
    s3 = [ s3fs fsspec ];
    azure = [ adlfs fsspec ];
    discord = [ ]; # discord-py
    github = [ pygithub ];
    gitlab = [ python-gitlab ];
    reddit = [ praw ];
    slack = [ slack-sdk ];
    wikipedia = [ wikipedia ];
    google-drive = [ google-api-python-client ];
    gcs = [ ]; # gcsfs fsspec
    elasticsearch = [ elasticsearch8 jq ];
    dropbox = [ ]; # dropboxdrivefs fsspec
    confluence = [ atlassian-python-api ];
  };
in
buildPythonPackage {
  pname = "unstructured";
  inherit version;
  format = "setuptools";

  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured";
    rev = "refs/tags/${version}";
    hash = "sha256-RaVg4XNmh1S5G1CHQiME7t/BmK0MI9M8wI2YTKjpqzM=";
  };

  propagatedBuildInputs = [
    chardet
    filetype
    lxml
    msg-parser
    nltk
    openpyxl
    pandas
    pdf2image
    pdfminer-six
    pillow
    pypandoc
    python-docx
    python-pptx
    python-magic
    markdown
    requests
    tabulate
    xlrd
  ];

  pythonImportsCheck = [ "unstructured" ];

  # The tests try to download the punkt data from nltk.
  # TODO: figure out how to make it available so the tests can be enabled.
  doCheck = false;

  # Only takes effect once doCheck is enabled; kept ready for that.
  nativeCheckInputs = [
    pytestCheckHook
    black
    coverage
    click
    freezegun
    mypy
    pytest-cov
    pytest-mock
    vcrpy
    grpcio
  ];

  passthru.optional-dependencies = optional-dependencies;

  meta = with lib; {
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    homepage = "https://github.com/Unstructured-IO/unstructured";
    changelog = "https://github.com/Unstructured-IO/unstructured/blob/${version}/CHANGELOG.md";
    license = licenses.asl20;
    maintainers = with maintainers; [ happysalada ];
  };
}