# Package expression for markitdown, built from a pinned commit of the
# microsoft/markitdown GitHub repository with the hatchling build backend.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  hatchling,
  beautifulsoup4,
  ffmpeg-headless,
  mammoth,
  markdownify,
  numpy,
  openai,
  openpyxl,
  pandas,
  pathvalidate,
  pdfminer-six,
  puremagic,
  pydub,
  python-pptx,
  requests,
  speechrecognition,
  youtube-transcript-api,
  pytestCheckHook,
  gitUpdater,
}:

buildPythonPackage {
  pname = "markitdown";
  # NOTE(review): nixpkgs versioning guidance prefers "0-unstable-YYYY-MM-DD"
  # (or "<last-release>-unstable-YYYY-MM-DD") for untagged snapshots; left
  # unchanged here to avoid altering the derivation name — confirm and adjust.
  version = "unstable-2024-12-18";
  pyproject = true;

  # Source is pinned to an exact commit rather than a release tag.
  src = fetchFromGitHub {
    owner = "microsoft";
    repo = "markitdown";
    rev = "3ce21a47abed0e4db162de1088d661887ae076ff";
    hash = "sha256-5YafFL8OHNcGgB/qH6CmX0rTith1ZSRNIa+ktl4Ffvg=";
  };

  build-system = [ hatchling ];

  dependencies = [
    beautifulsoup4
    # Not a Python module; presumably required at runtime by pydub for audio
    # decoding — TODO confirm.
    ffmpeg-headless
    mammoth
    markdownify
    numpy
    openai
    openpyxl
    pandas
    pathvalidate
    pdfminer-six
    puremagic
    pydub
    python-pptx
    requests
    speechrecognition
    youtube-transcript-api
  ];

  pythonImportsCheck = [ "markitdown" ];

  nativeCheckInputs = [ pytestCheckHook ];

  disabledTests = [
    # Require network access
    "test_markitdown_remote"
  ];

  # The attribute must be spelled `updateScript` (singular); the previous
  # `updateScripts` spelling is not read by the nixpkgs update tooling, so the
  # updater was never picked up.
  # NOTE(review): the package tracks an untagged commit, so
  # `unstableGitUpdater` may be a better fit than `gitUpdater` — confirm.
  passthru.updateScript = gitUpdater { };

  meta = {
    description = "Python tool for converting files and office documents to Markdown";
    homepage = "https://github.com/microsoft/markitdown";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ drupol ];
  };
}