# Package expression for PaddleOCR, built from the upstream GitHub tag with
# `buildPythonPackage`. Every function argument other than `lib`, the two
# builders/fetchers and `pythonRelaxDepsHook` is a Python dependency that is
# propagated to consumers of the package.
{
  lib,
  buildPythonPackage,
  pythonRelaxDepsHook,
  fetchFromGitHub,
  attrdict,
  beautifulsoup4,
  cython,
  fire,
  fonttools,
  lmdb,
  lxml,
  numpy,
  opencv4,
  openpyxl,
  pdf2docx,
  pillow,
  premailer,
  pyclipper,
  pymupdf,
  python-docx,
  rapidfuzz,
  scikit-image,
  shapely,
  tqdm,
  paddlepaddle,
  lanms-neo,
  polygon3,
}:

let
  # Bound outside the attribute set so that both the source tag and the
  # changelog URL can interpolate it.
  version = "2.7.1";
in
buildPythonPackage {
  pname = "paddleocr";
  inherit version;
  format = "setuptools";

  src = fetchFromGitHub {
    owner = "PaddlePaddle";
    repo = "PaddleOCR";
    rev = "v${version}";
    hash = "sha256-5Dt4UL+7dwJNjcNnCVi3o8bLCt7/m/M6oh1vPu9rza8=";
  };

  patches = [
    # `ppocr.data.imaug` re-exports the `IaaAugment` and `CopyPaste` classes.
    # Both depend on the `imgaug` package, which is unmaintained and has been
    # removed from nixpkgs.
    #
    # The image OCR feature of PaddleOCR does not use these classes, so it
    # keeps working once the `IaaAugment` and `CopyPaste` exports are
    # stripped. Doing so probably breaks some of the OCR model creation
    # tooling that PaddleOCR provides, however.
    ./remove-import-imaug.patch
  ];

  nativeBuildInputs = [ pythonRelaxDepsHook ];

  # All version constraints are relaxed: trying to relax only pymupdf makes
  # the whole build fail.
  pythonRelaxDeps = true;

  # Requirements dropped from the package metadata entirely.
  pythonRemoveDeps = [
    "imgaug"
    "visualdl"
    "opencv-python"
    "opencv-contrib-python"
  ];

  propagatedBuildInputs = [
    attrdict
    beautifulsoup4
    cython
    fire
    fonttools
    lmdb
    lxml
    numpy
    opencv4
    openpyxl
    pdf2docx
    pillow
    premailer
    pyclipper
    pymupdf
    python-docx
    rapidfuzz
    scikit-image
    shapely
    tqdm
    paddlepaddle
    lanms-neo
    polygon3
  ];

  # TODO: The tests depend, among possibly other things, on `cudatoolkit`,
  # but cudatoolkit fails to install. The check setup that was attempted is
  # kept below for reference:
  # preCheck = "export HOME=$TMPDIR";
  # nativeCheckInputs = with pkgs; [ which cudatoolkit ];
  doCheck = false;

  # `lib` attributes are referenced explicitly rather than through
  # `with lib;`, so every name below is visibly resolved.
  meta = {
    description = "Multilingual OCR toolkits based on PaddlePaddle";
    longDescription = ''
      PaddleOCR aims to create multilingual, awesome, leading, and practical OCR
      tools that help users train better models and apply them into practice.
    '';
    homepage = "https://github.com/PaddlePaddle/PaddleOCR";
    changelog = "https://github.com/PaddlePaddle/PaddleOCR/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.happysalada ];
    platforms = [
      "x86_64-linux"
      "x86_64-darwin"
      "aarch64-darwin"
    ];
  };
}