{
  lib,
  fetchFromGitHub,
  buildPythonPackage,

  # build-system
  poetry-core,

  # nativeBuildInputs
  cmake,
  pkg-config,

  # buildInputs
  cxxopts,
  libjpeg,
  loguru-cpp,
  nlohmann_json,
  pybind11,
  qpdf,
  utf8cpp,
  zlib,

  # python dependencies
  docling-core,
  pillow,
  pydantic,
  tabulate,

  # tests
  pytestCheckHook,
}:
23
# Python package definition for docling-parse, built from the GitHub source
# tagged "v${version}" with the native libraries listed in buildInputs.
buildPythonPackage rec {
  pname = "docling-parse";
  version = "4.1.0";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "docling-project";
    repo = "docling-parse";
    tag = "v${version}";
    hash = "sha256-1vl5Ij25NXAwhoXLJ35lcr5r479jrdKd9DxWhYbCApw=";
  };

  patches = [
    # Fixes the test_parse unit tests:
    # export_to_textlines in docling-core >= 2.38.2 includes the text direction
    # by default, which is not included in upstream's ground-truth data.
    # TODO: remove when docling-core version gets bumped in upstream's uv.lock
    ./test_parse.patch
  ];

  # cmake is only provided as a tool here; the cmake hook's configure phase is
  # disabled. Presumably the Python build backend invokes cmake itself --
  # TODO confirm against upstream's build script.
  dontUseCmakeConfigure = true;

  nativeBuildInputs = [
    cmake
    pkg-config
  ];

  build-system = [
    poetry-core
  ];

  buildInputs = [
    pybind11
    cxxopts
    libjpeg
    loguru-cpp
    nlohmann_json
    qpdf
    utf8cpp
    zlib
  ];

  env = {
    # Put utf8cpp's include/utf8cpp directory on the compiler's include path.
    # Presumably upstream includes the headers without the "utf8cpp/" prefix
    # -- TODO confirm.
    NIX_CFLAGS_COMPILE = "-I${lib.getDev utf8cpp}/include/utf8cpp";

    # Exported to the build environment; presumably read by upstream's build
    # script to use the libraries from buildInputs -- TODO confirm.
    USE_SYSTEM_DEPS = true;
  };

  # NOTE(review): with dontUseCmakeConfigure = true the cmake configure hook
  # does not run, so these flags are likely unused -- confirm before removing.
  cmakeFlags = [
    "-DUSE_SYSTEM_DEPS=True"
  ];

  dependencies = [
    tabulate
    pillow
    pydantic
    docling-core
  ];

  # Relax upstream's version constraints on these dependencies.
  pythonRelaxDeps = [
    "pydantic"
    "pillow"
  ];

  pythonImportsCheck = [
    "docling_parse"
  ];

  nativeCheckInputs = [
    pytestCheckHook
  ];

  meta = {
    # URLs use the same GitHub owner as `src` (docling-project).
    changelog = "https://github.com/docling-project/docling-parse/blob/${src.tag}/CHANGELOG.md";
    description = "Simple package to extract text with coordinates from programmatic PDFs";
    homepage = "https://github.com/docling-project/docling-parse";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ drupol ];
  };
}