# Package expression for markitdown, built from the `packages/markitdown`
# subdirectory of the microsoft/markitdown GitHub repository.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  hatchling,
  beautifulsoup4,
  ffmpeg-headless,
  magika,
  mammoth,
  markdownify,
  numpy,
  openai,
  openpyxl,
  pandas,
  pathvalidate,
  pdfminer-six,
  puremagic,
  pydub,
  python-pptx,
  requests,
  speechrecognition,
  youtube-transcript-api,
  olefile,
  xlrd,
  lxml,
  pytestCheckHook,
  gitUpdater,
}:

buildPythonPackage rec {
  pname = "markitdown";
  version = "0.1.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "microsoft";
    repo = "markitdown";
    # Upstream release tags carry a "v" prefix (see rev-prefix below).
    tag = "v${version}";
    hash = "sha256-siXam2a+ryyLBbciQgjd9k6zC8r46LbzjPMoc1dG0wk=";
  };

  # The fetched repository holds the package in a subdirectory; build from there.
  sourceRoot = "${src.name}/packages/markitdown";

  build-system = [ hatchling ];

  dependencies = [
    beautifulsoup4
    # NOTE(review): ffmpeg-headless is not a Python module; presumably listed
    # for pydub's runtime ffmpeg lookup — confirm this is the intended place.
    ffmpeg-headless
    lxml
    magika
    mammoth
    markdownify
    numpy
    olefile
    openai
    openpyxl
    pandas
    pathvalidate
    pdfminer-six
    puremagic
    pydub
    python-pptx
    requests
    speechrecognition
    xlrd
    youtube-transcript-api
  ];

  pythonImportsCheck = [ "markitdown" ];

  nativeCheckInputs = [ pytestCheckHook ];

  disabledTests = [
    # Require network access
    "test_markitdown_remote"
    "test_module_vectors"
    "test_cli_vectors"
    "test_module_misc"
  ];

  # The attribute consumed by the update tooling is `updateScript` (singular).
  # `rev-prefix` strips the leading "v" from upstream tags so the computed
  # version matches the `tag = "v${version}"` convention used in `src`.
  passthru.updateScript = gitUpdater { rev-prefix = "v"; };

  meta = {
    description = "Python tool for converting files and office documents to Markdown";
    homepage = "https://github.com/microsoft/markitdown";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ drupol ];
  };
}