Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
at 17.09-beta 13 kB view raw
This patch is required for pyocr to work with Tesseract version 3.05.00
and has been submitted upstream at the following URL:

https://github.com/jflesch/pyocr/pull/62

diff --git a/src/pyocr/builders.py b/src/pyocr/builders.py
index 73c964d..20f390c 100644
--- a/src/pyocr/builders.py
+++ b/src/pyocr/builders.py
@@ -240,8 +240,10 @@ class BaseBuilder(object):
         cuneiform_args : Arguments passed to the Cuneiform command line.
     """
 
-    def __init__(self, file_extensions, tesseract_configs, cuneiform_args):
+    def __init__(self, file_extensions, tesseract_flags, tesseract_configs,
+                 cuneiform_args):
         self.file_extensions = file_extensions
+        self.tesseract_flags = tesseract_flags
         self.tesseract_configs = tesseract_configs
         self.cuneiform_args = cuneiform_args
 
@@ -298,7 +300,7 @@ class TextBuilder(BaseBuilder):
     def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False,
                  cuneiform_fax=False, cuneiform_singlecolumn=False):
         file_ext = ["txt"]
-        tess_conf = ["-psm", str(tesseract_layout)]
+        tess_flags = ["-psm", str(tesseract_layout)]
         cun_args = ["-f", "text"]
         # Add custom cuneiform parameters if needed
         for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"),
@@ -306,7 +308,7 @@ class TextBuilder(BaseBuilder):
                          (cuneiform_singlecolumn, "--singlecolumn")]:
             if par:
                 cun_args.append(arg)
-        super(TextBuilder, self).__init__(file_ext, tess_conf, cun_args)
+        super(TextBuilder, self).__init__(file_ext, tess_flags, [], cun_args)
         self.tesseract_layout = tesseract_layout
         self.built_text = []
 
@@ -540,9 +542,11 @@ class WordBoxBuilder(BaseBuilder):
 
     def __init__(self, tesseract_layout=1):
         file_ext = ["html", "hocr"]
-        tess_conf = ["hocr", "-psm", str(tesseract_layout)]
+        tess_flags = ["-psm", str(tesseract_layout)]
+        tess_conf = ["hocr"]
         cun_args = ["-f", "hocr"]
-        super(WordBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
+        super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
+                                             cun_args)
         self.word_boxes = []
         self.tesseract_layout = tesseract_layout
 
@@ -614,9 +618,11 @@ class LineBoxBuilder(BaseBuilder):
 
     def __init__(self, tesseract_layout=1):
         file_ext = ["html", "hocr"]
-        tess_conf = ["hocr", "-psm", str(tesseract_layout)]
+        tess_flags = ["-psm", str(tesseract_layout)]
+        tess_conf = ["hocr"]
         cun_args = ["-f", "hocr"]
-        super(LineBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
+        super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
+                                             cun_args)
         self.lines = []
         self.tesseract_layout = tesseract_layout
 
diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py
index 0c2259a..f7ab309 100644
--- a/src/pyocr/libtesseract/tesseract_raw.py
+++ b/src/pyocr/libtesseract/tesseract_raw.py
@@ -263,11 +263,22 @@ if g_libtesseract:
     ]
     g_libtesseract.TessDeleteText.restype = None
 
-    g_libtesseract.TessBaseAPIDetectOS.argtypes = [
-        ctypes.c_void_p,  # TessBaseAPI*
-        ctypes.POINTER(OSResults),
-    ]
-    g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool
+    if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'):
+        g_libtesseract.TessBaseAPIDetectOrientationScript.argtypes = [
+            ctypes.c_void_p,  # TessBaseAPI*
+            ctypes.POINTER(ctypes.c_int),  # orient_deg
+            ctypes.POINTER(ctypes.c_float),  # orient_conf
+            ctypes.POINTER(ctypes.c_char_p),  # script_name
+            ctypes.POINTER(ctypes.c_float),  # script_conf
+        ]
+        g_libtesseract.TessBaseAPIDetectOrientationScript.restype = \
+            ctypes.c_bool
+    else:
+        g_libtesseract.TessBaseAPIDetectOS.argtypes = [
+            ctypes.c_void_p,  # TessBaseAPI*
+            ctypes.POINTER(OSResults),
+        ]
+        g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool
 
 
 def init(lang=None):
@@ -526,15 +537,37 @@ def detect_os(handle):
     global g_libtesseract
     assert(g_libtesseract)
 
-    results = OSResults()
-    r = g_libtesseract.TessBaseAPIDetectOS(
-        ctypes.c_void_p(handle),
-        ctypes.pointer(results)
-    )
-    if not r:
-        raise TesseractError("detect_orientation failed",
-                             "TessBaseAPIDetectOS() failed")
-    return {
-        "orientation": results.best_orientation_id,
-        "confidence": results.best_oconfidence,
-    }
+    # Use the new API function if it is available, because since Tesseract
+    # 3.05.00 the old API function _always_ returns False.
+    if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'):
+        orientation_deg = ctypes.c_int(0)
+        orientation_confidence = ctypes.c_float(0.0)
+
+        r = g_libtesseract.TessBaseAPIDetectOrientationScript(
+            ctypes.c_void_p(handle),
+            ctypes.byref(orientation_deg),
+            ctypes.byref(orientation_confidence),
+            None,  # script_name
+            None  # script_confidence
+        )
+
+        if not r:
+            raise TesseractError("detect_orientation failed",
+                                 "TessBaseAPIDetectOrientationScript() failed")
+        return {
+            "orientation": round(orientation_deg.value / 90),
+            "confidence": orientation_confidence.value,
+        }
+    else:  # old API (before Tesseract 3.05.00)
+        results = OSResults()
+        r = g_libtesseract.TessBaseAPIDetectOS(
+            ctypes.c_void_p(handle),
+            ctypes.pointer(results)
+        )
+        if not r:
+            raise TesseractError("detect_orientation failed",
+                                 "TessBaseAPIDetectOS() failed")
+        return {
+            "orientation": results.best_orientation_id,
+            "confidence": results.best_oconfidence,
+        }
diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py
index 99b0121..658c96b 100755
--- a/src/pyocr/tesseract.py
+++ b/src/pyocr/tesseract.py
@@ -22,6 +22,8 @@ import os
 import subprocess
 import sys
 import tempfile
+import contextlib
+import shutil
 
 from . import builders
 from . import error
@@ -62,9 +64,11 @@ class CharBoxBuilder(builders.BaseBuilder):
 
     def __init__(self):
         file_ext = ["box"]
+        tess_flags = []
         tess_conf = ["batch.nochop", "makebox"]
         cun_args = []
-        super(CharBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
+        super(CharBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
+                                             cun_args)
         self.tesseract_layout = 1
 
     @staticmethod
@@ -173,18 +177,19 @@ def detect_orientation(image, lang=None):
         TesseractError --- if no script detected on the image
     """
     _set_environment()
-    with temp_file(".bmp") as input_file:
-        command = [TESSERACT_CMD, input_file.name, 'stdout', "-psm", "0"]
+    with temp_dir() as tmpdir:
+        command = [TESSERACT_CMD, "input.bmp", 'stdout', "-psm", "0"]
         if lang is not None:
             command += ['-l', lang]
 
         if image.mode != "RGB":
             image = image.convert("RGB")
-        image.save(input_file.name)
+        image.save(os.path.join(tmpdir, "input.bmp"))
 
         proc = subprocess.Popen(command, stdin=subprocess.PIPE, shell=False,
                                 startupinfo=g_subprocess_startup_info,
                                 creationflags=g_creation_flags,
+                                cwd=tmpdir,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
         proc.stdin.close()
@@ -224,8 +229,8 @@ def get_available_builders():
     ]
 
 
-def run_tesseract(input_filename, output_filename_base, lang=None,
-                  configs=None):
+def run_tesseract(input_filename, output_filename_base, cwd=None, lang=None,
+                  flags=None, configs=None):
     '''
     Runs Tesseract:
         `TESSERACT_CMD` \
@@ -238,6 +243,8 @@ def run_tesseract(input_filename, output_filename_base, lang=None,
         input_filename --- image to read
         output_filename_base --- file name in which must be stored the result
             (without the extension)
+        cwd --- Run Tesseract in the specified working directory or use current
+            one if None
         lang --- Tesseract language to use (if None, none will be specified)
         config --- List of Tesseract configs to use (if None, none will be
             specified)
@@ -252,10 +259,13 @@ def run_tesseract(input_filename, output_filename_base, lang=None,
     if lang is not None:
         command += ['-l', lang]
 
+    if flags is not None:
+        command += flags
+
     if configs is not None:
         command += configs
 
-    proc = subprocess.Popen(command,
+    proc = subprocess.Popen(command, cwd=cwd,
                             startupinfo=g_subprocess_startup_info,
                             creationflags=g_creation_flags,
                             stdout=subprocess.PIPE,
@@ -301,11 +311,18 @@ class ReOpenableTempfile(object):
         self.name = None
 
 
-def temp_file(suffix):
-    ''' Returns a temporary file '''
-    if os.name == 'nt':  # Windows
-        return ReOpenableTempfile(suffix)
-    return tempfile.NamedTemporaryFile(prefix='tess_', suffix=suffix)
+@contextlib.contextmanager
+def temp_dir():
+    """
+    A context manager for maintaining a temporary directory
+    """
+    # NOTE: Drop this as soon as we don't support Python 2.7 anymore, because
+    # since Python 3.2 there is a context manager called TemporaryDirectory().
+    path = tempfile.mkdtemp(prefix='tess_')
+    try:
+        yield path
+    finally:
+        shutil.rmtree(path)
 
 
 def image_to_string(image, lang=None, builder=None):
@@ -329,23 +346,20 @@ def image_to_string(image, lang=None, builder=None):
 
     if builder is None:
         builder = builders.TextBuilder()
-    with temp_file(".bmp") as input_file:
-        with temp_file('') as output_file:
-            output_file_name_base = output_file.name
-
+    with temp_dir() as tmpdir:
         if image.mode != "RGB":
             image = image.convert("RGB")
-        image.save(input_file.name)
-        (status, errors) = run_tesseract(input_file.name,
-                                         output_file_name_base,
+        image.save(os.path.join(tmpdir, "input.bmp"))
+        (status, errors) = run_tesseract("input.bmp", "output", cwd=tmpdir,
                                          lang=lang,
+                                         flags=builder.tesseract_flags,
                                          configs=builder.tesseract_configs)
         if status:
             raise TesseractError(status, errors)
 
         output_file_name = "ERROR"
         for file_extension in builder.file_extensions:
-            output_file_name = ('%s.%s' % (output_file_name_base,
+            output_file_name = ('%s.%s' % (os.path.join(tmpdir, "output"),
                                            file_extension))
             if not os.access(output_file_name, os.F_OK):
                 continue
diff --git a/tests/tests_libtesseract.py b/tests/tests_libtesseract.py
index ccddd07..1ac2a4b 100644
--- a/tests/tests_libtesseract.py
+++ b/tests/tests_libtesseract.py
@@ -33,8 +33,9 @@ class TestContext(unittest.TestCase):
             (3, 3, 0),
             (3, 4, 0),
             (3, 4, 1),
+            (3, 5, 0),
         ), ("Tesseract does not have the expected version"
-            " (3.4.0) ! Some tests will be skipped !"))
+            " (3.5.0) ! Some tests will be skipped !"))
 
     def test_langs(self):
         langs = libtesseract.get_available_languages()
diff --git a/tests/tests_tesseract.py b/tests/tests_tesseract.py
index e29c512..fa4d483 100644
--- a/tests/tests_tesseract.py
+++ b/tests/tests_tesseract.py
@@ -27,8 +27,9 @@ class TestContext(unittest.TestCase):
             (3, 3, 0),
             (3, 4, 0),
             (3, 4, 1),
+            (3, 5, 0),
         ), ("Tesseract does not have the expected version"
-            " (3.4.0) ! Some tests will be skipped !"))
+            " (3.5.0) ! Some tests will be skipped !"))
 
     def test_langs(self):
         langs = tesseract.get_available_languages()