# Package definition for PaddleOCR, built with `buildPythonPackage` from the
# upstream GitHub release tag.
{
  # Build infrastructure
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  pythonRelaxDepsHook,

  # Python runtime dependencies
  attrdict,
  beautifulsoup4,
  cython,
  fire,
  fonttools,
  lanms-neo,
  lmdb,
  lxml,
  numpy,
  opencv4,
  openpyxl,
  paddlepaddle,
  pdf2docx,
  pillow,
  polygon3,
  premailer,
  pyclipper,
  pymupdf,
  python-docx,
  rapidfuzz,
  scikit-image,
  shapely,
  tqdm,
}:

let
  pname = "paddleocr";
  # Single source of truth for the release: reused for the git tag fetched
  # below and for the changelog URL in `meta`.
  version = "2.7.1";
in
buildPythonPackage {
  inherit pname version;
  format = "setuptools";

  src = fetchFromGitHub {
    owner = "PaddlePaddle";
    repo = "PaddleOCR";
    rev = "v${version}";
    hash = "sha256-5Dt4UL+7dwJNjcNnCVi3o8bLCt7/m/M6oh1vPu9rza8=";
  };

  patches = [
    # `ppocr.data.imaug` re-exports the `IaaAugment` and `CopyPaste` classes,
    # both of which depend on the `imgaug` package. `imgaug` is unmaintained
    # and has been removed from nixpkgs.
    #
    # The image OCR feature of PaddleOCR does not use these classes, so it
    # keeps working once the `IaaAugment` and `CopyPaste` exports are stripped.
    # Stripping them probably breaks some of the OCR model creation tooling
    # that PaddleOCR provides, however.
    ./remove-import-imaug.patch
  ];

  nativeBuildInputs = [ pythonRelaxDepsHook ];

  # All version constraints are relaxed: trying to relax only `pymupdf` makes
  # the whole build fail.
  pythonRelaxDeps = true;

  # Requirements removed from the package's declared dependencies.
  # NOTE(review): `imgaug` matches the patch above; the opencv entries are
  # presumably covered by `opencv4` below -- confirm.
  pythonRemoveDeps = [
    "imgaug"
    "visualdl"
    "opencv-python"
    "opencv-contrib-python"
  ];

  propagatedBuildInputs = [
    attrdict
    beautifulsoup4
    cython
    fire
    fonttools
    lmdb
    lxml
    numpy
    opencv4
    openpyxl
    pdf2docx
    pillow
    premailer
    pyclipper
    pymupdf
    python-docx
    rapidfuzz
    scikit-image
    shapely
    tqdm
    paddlepaddle
    lanms-neo
    polygon3
  ];

  # TODO: The tests depend on `cudatoolkit` (and possibly on other things),
  # but `cudatoolkit` fails to install, so the check phase stays disabled.
  # preCheck = "export HOME=$TMPDIR";
  # nativeCheckInputs = with pkgs; [ which cudatoolkit ];
  doCheck = false;

  meta = {
    description = "Multilingual OCR toolkits based on PaddlePaddle";
    longDescription = ''
      PaddleOCR aims to create multilingual, awesome, leading, and practical OCR
      tools that help users train better models and apply them into practice.
    '';
    homepage = "https://github.com/PaddlePaddle/PaddleOCR";
    changelog = "https://github.com/PaddlePaddle/PaddleOCR/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.happysalada ];
    platforms = [
      "x86_64-linux"
      "x86_64-darwin"
      "aarch64-darwin"
    ];
  };
}