ocrmypdf: move to python3Packages

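According to https://ocrmypdf.readthedocs.io/en/latest/api.html, OCRmyPDF
also provides a Python API, so package it as a Python module and expose the
command-line application through toPythonApplication.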

+247 -104
-1
pkgs/applications/office/paperless-ng/default.nix
··· 5 , ghostscript 6 , imagemagick 7 , jbig2enc 8 - , ocrmypdf 9 , optipng 10 , pngquant 11 , qpdf
··· 5 , ghostscript 6 , imagemagick 7 , jbig2enc 8 , optipng 9 , pngquant 10 , qpdf
+84
pkgs/development/python-modules/ocrmypdf/default.nix
···
··· 1 + { lib 2 + , buildPythonPackage 3 + , cffi 4 + , coloredlogs 5 + , fetchFromGitHub 6 + , ghostscript 7 + , img2pdf 8 + , importlib-resources 9 + , jbig2enc 10 + , leptonica 11 + , pdfminer 12 + , pikepdf 13 + , pillow 14 + , pluggy 15 + , pngquant 16 + , pytest-xdist 17 + , pytestCheckHook 18 + , reportlab 19 + , setuptools 20 + , setuptools-scm 21 + , setuptools-scm-git-archive 22 + , stdenv 23 + , substituteAll 24 + , tesseract4 25 + , tqdm 26 + , unpaper 27 + }: 28 + 29 + buildPythonPackage rec { 30 + pname = "ocrmypdf"; 31 + version = "12.5.0"; 32 + 33 + src = fetchFromGitHub { 34 + owner = "jbarlow83"; 35 + repo = "OCRmyPDF"; 36 + rev = "v${version}"; 37 + sha256 = "sha256-g80WedX+TGHE9EJ/RSgOc53PM17V3WZslUNaHoqKTo0="; 38 + }; 39 + 40 + patches = [ 41 + (substituteAll { 42 + src = ./paths.patch; 43 + gs = "${lib.getBin ghostscript}/bin/gs"; 44 + jbig2 = "${lib.getBin jbig2enc}/bin/jbig2"; 45 + liblept = "${lib.getLib leptonica}/lib/liblept${stdenv.hostPlatform.extensions.sharedLibrary}"; 46 + pngquant = "${lib.getBin pngquant}/bin/pngquant"; 47 + tesseract = "${lib.getBin tesseract4}/bin/tesseract"; 48 + unpaper = "${lib.getBin unpaper}/bin/unpaper"; 49 + }) 50 + ]; 51 + 52 + nativeBuildInputs = [ 53 + setuptools-scm-git-archive 54 + setuptools-scm 55 + ]; 56 + 57 + propagatedBuildInputs = [ 58 + cffi 59 + coloredlogs 60 + img2pdf 61 + importlib-resources 62 + pdfminer 63 + pikepdf 64 + pillow 65 + pluggy 66 + reportlab 67 + setuptools 68 + tqdm 69 + ]; 70 + 71 + checkInputs = [ 72 + pytest-xdist 73 + pytestCheckHook 74 + ]; 75 + 76 + meta = with lib; { 77 + homepage = "https://github.com/jbarlow83/OCRmyPDF"; 78 + description = "Adds an OCR text layer to scanned PDF files, allowing them to be searched"; 79 + license = with licenses; [ mpl20 mit ]; 80 + platforms = platforms.linux; 81 + maintainers = with maintainers; [ kiwi dotlambda ]; 82 + changelog = "https://github.com/jbarlow83/OCRmyPDF/blob/v${version}/docs/release_notes.rst"; 83 + }; 84 + }
+160
pkgs/development/python-modules/ocrmypdf/paths.patch
···
··· 1 + diff --git a/src/ocrmypdf/_exec/ghostscript.py b/src/ocrmypdf/_exec/ghostscript.py 2 + index 5c357f1b..f459763a 100644 3 + --- a/src/ocrmypdf/_exec/ghostscript.py 4 + +++ b/src/ocrmypdf/_exec/ghostscript.py 5 + @@ -25,28 +25,7 @@ from ocrmypdf.subprocess import get_version, run, run_polling_stderr 6 + 7 + log = logging.getLogger(__name__) 8 + 9 + -missing_gs_error = """ 10 + ---------------------------------------------------------------------- 11 + -This error normally occurs when ocrmypdf find can't Ghostscript. 12 + -Please ensure Ghostscript is installed and its location is added to 13 + -the system PATH environment variable. 14 + - 15 + -For details see: 16 + - https://ocrmypdf.readthedocs.io/en/latest/installation.html 17 + ---------------------------------------------------------------------- 18 + -""" 19 + - 20 + -_gswin = None 21 + -if os.name == 'nt': 22 + - _gswin = which('gswin64c') 23 + - if not _gswin: 24 + - _gswin = which('gswin32c') 25 + - if not _gswin: 26 + - raise MissingDependencyError(missing_gs_error) 27 + - _gswin = Path(_gswin).stem 28 + - 29 + -GS = _gswin if _gswin else 'gs' 30 + -del _gswin 31 + +GS = '@gs@' 32 + 33 + 34 + def version(): 35 + diff --git a/src/ocrmypdf/_exec/jbig2enc.py b/src/ocrmypdf/_exec/jbig2enc.py 36 + index 2e8a058b..65a09088 100644 37 + --- a/src/ocrmypdf/_exec/jbig2enc.py 38 + +++ b/src/ocrmypdf/_exec/jbig2enc.py 39 + @@ -14,7 +14,7 @@ from ocrmypdf.subprocess import get_version, run 40 + 41 + 42 + def version(): 43 + - return get_version('jbig2', regex=r'jbig2enc (\d+(\.\d+)*).*') 44 + + return get_version('@jbig2@', regex=r'jbig2enc (\d+(\.\d+)*).*') 45 + 46 + 47 + def available(): 48 + @@ -27,7 +27,7 @@ def available(): 49 + 50 + def convert_group(*, cwd, infiles, out_prefix): 51 + args = [ 52 + - 'jbig2', 53 + + '@jbig2@', 54 + '-b', 55 + out_prefix, 56 + '-s', # symbol mode (lossy) 57 + @@ -46,7 +46,7 @@ def convert_group_mp(args): 58 + 59 + 60 + def convert_single(*, cwd, infile, outfile): 61 + - 
args = ['jbig2', '-p', infile] 62 + + args = ['@jbig2@', '-p', infile] 63 + with open(outfile, 'wb') as fstdout: 64 + proc = run(args, cwd=cwd, stdout=fstdout, stderr=PIPE) 65 + proc.check_returncode() 66 + diff --git a/src/ocrmypdf/_exec/pngquant.py b/src/ocrmypdf/_exec/pngquant.py 67 + index ca8a4542..d0544174 100644 68 + --- a/src/ocrmypdf/_exec/pngquant.py 69 + +++ b/src/ocrmypdf/_exec/pngquant.py 70 + @@ -19,7 +19,7 @@ from ocrmypdf.subprocess import get_version, run 71 + 72 + 73 + def version(): 74 + - return get_version('pngquant', regex=r'(\d+(\.\d+)*).*') 75 + + return get_version('@pngquant@', regex=r'(\d+(\.\d+)*).*') 76 + 77 + 78 + def available(): 79 + @@ -46,7 +46,7 @@ def input_as_png(input_file: Path): 80 + def quantize(input_file: Path, output_file: Path, quality_min: int, quality_max: int): 81 + with input_as_png(input_file) as input_stream: 82 + args = [ 83 + - 'pngquant', 84 + + '@pngquant@', 85 + '--force', 86 + '--skip-if-larger', 87 + '--quality', 88 + diff --git a/src/ocrmypdf/_exec/tesseract.py b/src/ocrmypdf/_exec/tesseract.py 89 + index 33ead41e..5840f7c1 100644 90 + --- a/src/ocrmypdf/_exec/tesseract.py 91 + +++ b/src/ocrmypdf/_exec/tesseract.py 92 + @@ -78,7 +78,7 @@ class TesseractVersion(StrictVersion): 93 + 94 + 95 + def version(): 96 + - return get_version('tesseract', regex=r'tesseract\s(.+)') 97 + + return get_version('@tesseract@', regex=r'tesseract\s(.+)') 98 + 99 + 100 + def has_user_words(): 101 + @@ -100,7 +100,7 @@ def get_languages(): 102 + msg += output 103 + return msg 104 + 105 + - args_tess = ['tesseract', '--list-langs'] 106 + + args_tess = ['@tesseract@', '--list-langs'] 107 + try: 108 + proc = run( 109 + args_tess, 110 + @@ -122,7 +122,7 @@ def get_languages(): 111 + 112 + 113 + def tess_base_args(langs: List[str], engine_mode: Optional[int]) -> List[str]: 114 + - args = ['tesseract'] 115 + + args = ['@tesseract@'] 116 + if langs: 117 + args.extend(['-l', '+'.join(langs)]) 118 + if engine_mode is not None: 119 + diff 
--git a/src/ocrmypdf/_exec/unpaper.py b/src/ocrmypdf/_exec/unpaper.py 120 + index 3c3ae72c..d269966a 100644 121 + --- a/src/ocrmypdf/_exec/unpaper.py 122 + +++ b/src/ocrmypdf/_exec/unpaper.py 123 + @@ -31,7 +31,7 @@ log = logging.getLogger(__name__) 124 + 125 + 126 + def version() -> str: 127 + - return get_version('unpaper') 128 + + return get_version('@unpaper@') 129 + 130 + 131 + def _setup_unpaper_io(tmpdir: Path, input_file: Path) -> Tuple[Path, Path]: 132 + @@ -71,7 +71,7 @@ def _setup_unpaper_io(tmpdir: Path, input_file: Path) -> Tuple[Path, Path]: 133 + def run( 134 + input_file: Path, output_file: Path, *, dpi: DecFloat, mode_args: List[str] 135 + ) -> None: 136 + - args_unpaper = ['unpaper', '-v', '--dpi', str(round(dpi, 6))] + mode_args 137 + + args_unpaper = ['@unpaper@', '-v', '--dpi', str(round(dpi, 6))] + mode_args 138 + 139 + with TemporaryDirectory() as tmpdir: 140 + input_pnm, output_pnm = _setup_unpaper_io(Path(tmpdir), input_file) 141 + diff --git a/src/ocrmypdf/leptonica.py b/src/ocrmypdf/leptonica.py 142 + index e4814f1a..fdaf7ea4 100644 143 + --- a/src/ocrmypdf/leptonica.py 144 + +++ b/src/ocrmypdf/leptonica.py 145 + @@ -33,14 +33,7 @@ from ocrmypdf.lib._leptonica import ffi 146 + 147 + logger = logging.getLogger(__name__) 148 + 149 + -if os.name == 'nt': 150 + - from ocrmypdf.subprocess._windows import shim_env_path 151 + - 152 + - libname = 'liblept-5' 153 + - os.environ['PATH'] = shim_env_path() 154 + -else: 155 + - libname = 'lept' 156 + -_libpath = find_library(libname) 157 + +_libpath = '@liblept@' 158 + if not _libpath: 159 + raise MissingDependencyError( 160 + """
-89
pkgs/tools/text/ocrmypdf/default.nix
··· 1 - { fetchFromGitHub 2 - , ghostscript 3 - , img2pdf 4 - , jbig2enc 5 - , leptonica 6 - , pngquant 7 - , python3 8 - , python3Packages 9 - , qpdf 10 - , lib 11 - , stdenv 12 - , tesseract4 13 - , unpaper 14 - , substituteAll 15 - }: 16 - let 17 - inherit (python3Packages) buildPythonApplication; 18 - 19 - runtimeDeps = with python3Packages; [ 20 - ghostscript 21 - jbig2enc 22 - leptonica 23 - pngquant 24 - qpdf 25 - tesseract4 26 - unpaper 27 - pillow 28 - ]; 29 - 30 - in 31 - buildPythonApplication rec { 32 - pname = "ocrmypdf"; 33 - version = "12.5.0"; 34 - 35 - src = fetchFromGitHub { 36 - owner = "jbarlow83"; 37 - repo = "OCRmyPDF"; 38 - rev = "v${version}"; 39 - sha256 = "sha256-g80WedX+TGHE9EJ/RSgOc53PM17V3WZslUNaHoqKTo0="; 40 - }; 41 - 42 - nativeBuildInputs = with python3Packages; [ 43 - setuptools 44 - setuptools-scm-git-archive 45 - setuptools-scm 46 - ]; 47 - 48 - propagatedBuildInputs = with python3Packages; [ 49 - cffi 50 - coloredlogs 51 - img2pdf 52 - importlib-resources 53 - pdfminer 54 - pluggy 55 - pikepdf 56 - pillow 57 - reportlab 58 - setuptools 59 - tqdm 60 - ]; 61 - 62 - checkInputs = with python3Packages; [ 63 - pypdf2 64 - pytest 65 - pytest-helpers-namespace 66 - pytest-xdist 67 - pytest-cov 68 - python-xmp-toolkit 69 - pytestCheckHook 70 - ] ++ runtimeDeps; 71 - 72 - patches = [ 73 - (substituteAll { 74 - src = ./liblept.patch; 75 - liblept = "${lib.getLib leptonica}/lib/liblept${stdenv.hostPlatform.extensions.sharedLibrary}"; 76 - }) 77 - ]; 78 - 79 - makeWrapperArgs = [ "--prefix PATH : ${lib.makeBinPath [ ghostscript jbig2enc pngquant qpdf tesseract4 unpaper ]}" ]; 80 - 81 - meta = with lib; { 82 - homepage = "https://github.com/jbarlow83/OCRmyPDF"; 83 - description = "Adds an OCR text layer to scanned PDF files, allowing them to be searched"; 84 - license = with licenses; [ mpl20 mit ]; 85 - platforms = platforms.linux; 86 - maintainers = [ maintainers.kiwi ]; 87 - changelog = 
"https://github.com/jbarlow83/OCRmyPDF/blob/v${version}/docs/release_notes.rst"; 88 - }; 89 - }
···
-13
pkgs/tools/text/ocrmypdf/liblept.patch
··· 1 - diff --git a/src/ocrmypdf/leptonica.py b/src/ocrmypdf/leptonica.py 2 - index 328b063..b993cc9 100644 3 - --- a/src/ocrmypdf/leptonica.py 4 - +++ b/src/ocrmypdf/leptonica.py 5 - @@ -46,7 +46,7 @@ if os.name == 'nt': 6 - os.environ['PATH'] = shim_paths_with_program_files() 7 - else: 8 - libname = 'lept' 9 - -_libpath = find_library(libname) 10 - +_libpath = '@liblept@' 11 - if not _libpath: 12 - raise MissingDependencyError( 13 - """
···
+1 -1
pkgs/top-level/all-packages.nix
··· 3228 3229 oci-cli = callPackage ../tools/admin/oci-cli { }; 3230 3231 - ocrmypdf = callPackage ../tools/text/ocrmypdf { }; 3232 3233 ocrfeeder = callPackage ../applications/graphics/ocrfeeder { }; 3234
··· 3228 3229 oci-cli = callPackage ../tools/admin/oci-cli { }; 3230 3231 + ocrmypdf = with python3.pkgs; toPythonApplication ocrmypdf; 3232 3233 ocrfeeder = callPackage ../applications/graphics/ocrfeeder { }; 3234
+2
pkgs/top-level/python-packages.nix
··· 5059 5060 oci = callPackage ../development/python-modules/oci { }; 5061 5062 od = callPackage ../development/python-modules/od { }; 5063 5064 odfpy = callPackage ../development/python-modules/odfpy { };
··· 5059 5060 oci = callPackage ../development/python-modules/oci { }; 5061 5062 + ocrmypdf = callPackage ../development/python-modules/ocrmypdf { }; 5063 + 5064 od = callPackage ../development/python-modules/od { }; 5065 5066 odfpy = callPackage ../development/python-modules/odfpy { };