This patch is required for pyocr to work with Tesseract version 3.05.00.
It has been submitted upstream at the following URL:

https://github.com/jflesch/pyocr/pull/62
5
6diff --git a/src/pyocr/builders.py b/src/pyocr/builders.py
7index 73c964d..20f390c 100644
8--- a/src/pyocr/builders.py
9+++ b/src/pyocr/builders.py
10@@ -240,8 +240,10 @@ class BaseBuilder(object):
11 cuneiform_args : Arguments passed to the Cuneiform command line.
12 """
13
14- def __init__(self, file_extensions, tesseract_configs, cuneiform_args):
15+ def __init__(self, file_extensions, tesseract_flags, tesseract_configs,
16+ cuneiform_args):
17 self.file_extensions = file_extensions
18+ self.tesseract_flags = tesseract_flags
19 self.tesseract_configs = tesseract_configs
20 self.cuneiform_args = cuneiform_args
21
22@@ -298,7 +300,7 @@ class TextBuilder(BaseBuilder):
23 def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False,
24 cuneiform_fax=False, cuneiform_singlecolumn=False):
25 file_ext = ["txt"]
26- tess_conf = ["-psm", str(tesseract_layout)]
27+ tess_flags = ["-psm", str(tesseract_layout)]
28 cun_args = ["-f", "text"]
29 # Add custom cuneiform parameters if needed
30 for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"),
31@@ -306,7 +308,7 @@ class TextBuilder(BaseBuilder):
32 (cuneiform_singlecolumn, "--singlecolumn")]:
33 if par:
34 cun_args.append(arg)
35- super(TextBuilder, self).__init__(file_ext, tess_conf, cun_args)
36+ super(TextBuilder, self).__init__(file_ext, tess_flags, [], cun_args)
37 self.tesseract_layout = tesseract_layout
38 self.built_text = []
39
40@@ -540,9 +542,11 @@ class WordBoxBuilder(BaseBuilder):
41
42 def __init__(self, tesseract_layout=1):
43 file_ext = ["html", "hocr"]
44- tess_conf = ["hocr", "-psm", str(tesseract_layout)]
45+ tess_flags = ["-psm", str(tesseract_layout)]
46+ tess_conf = ["hocr"]
47 cun_args = ["-f", "hocr"]
48- super(WordBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
49+ super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
50+ cun_args)
51 self.word_boxes = []
52 self.tesseract_layout = tesseract_layout
53
54@@ -614,9 +618,11 @@ class LineBoxBuilder(BaseBuilder):
55
56 def __init__(self, tesseract_layout=1):
57 file_ext = ["html", "hocr"]
58- tess_conf = ["hocr", "-psm", str(tesseract_layout)]
59+ tess_flags = ["-psm", str(tesseract_layout)]
60+ tess_conf = ["hocr"]
61 cun_args = ["-f", "hocr"]
62- super(LineBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
63+ super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
64+ cun_args)
65 self.lines = []
66 self.tesseract_layout = tesseract_layout
67
68diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py
69index 0c2259a..f7ab309 100644
70--- a/src/pyocr/libtesseract/tesseract_raw.py
71+++ b/src/pyocr/libtesseract/tesseract_raw.py
72@@ -263,11 +263,22 @@ if g_libtesseract:
73 ]
74 g_libtesseract.TessDeleteText.restype = None
75
76- g_libtesseract.TessBaseAPIDetectOS.argtypes = [
77- ctypes.c_void_p, # TessBaseAPI*
78- ctypes.POINTER(OSResults),
79- ]
80- g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool
81+ if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'):
82+ g_libtesseract.TessBaseAPIDetectOrientationScript.argtypes = [
83+ ctypes.c_void_p, # TessBaseAPI*
84+ ctypes.POINTER(ctypes.c_int), # orient_deg
85+ ctypes.POINTER(ctypes.c_float), # orient_conf
86+ ctypes.POINTER(ctypes.c_char_p), # script_name
87+ ctypes.POINTER(ctypes.c_float), # script_conf
88+ ]
89+ g_libtesseract.TessBaseAPIDetectOrientationScript.restype = \
90+ ctypes.c_bool
91+ else:
92+ g_libtesseract.TessBaseAPIDetectOS.argtypes = [
93+ ctypes.c_void_p, # TessBaseAPI*
94+ ctypes.POINTER(OSResults),
95+ ]
96+ g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool
97
98
99 def init(lang=None):
100@@ -526,15 +537,37 @@ def detect_os(handle):
101 global g_libtesseract
102 assert(g_libtesseract)
103
104- results = OSResults()
105- r = g_libtesseract.TessBaseAPIDetectOS(
106- ctypes.c_void_p(handle),
107- ctypes.pointer(results)
108- )
109- if not r:
110- raise TesseractError("detect_orientation failed",
111- "TessBaseAPIDetectOS() failed")
112- return {
113- "orientation": results.best_orientation_id,
114- "confidence": results.best_oconfidence,
115- }
116+ # Use the new API function if it is available, because since Tesseract
117+ # 3.05.00 the old API function _always_ returns False.
118+ if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'):
119+ orientation_deg = ctypes.c_int(0)
120+ orientation_confidence = ctypes.c_float(0.0)
121+
122+ r = g_libtesseract.TessBaseAPIDetectOrientationScript(
123+ ctypes.c_void_p(handle),
124+ ctypes.byref(orientation_deg),
125+ ctypes.byref(orientation_confidence),
126+ None, # script_name
127+ None # script_confidence
128+ )
129+
130+ if not r:
131+ raise TesseractError("detect_orientation failed",
132+ "TessBaseAPIDetectOrientationScript() failed")
133+ return {
134+ "orientation": round(orientation_deg.value / 90),
135+ "confidence": orientation_confidence.value,
136+ }
137+ else: # old API (before Tesseract 3.05.00)
138+ results = OSResults()
139+ r = g_libtesseract.TessBaseAPIDetectOS(
140+ ctypes.c_void_p(handle),
141+ ctypes.pointer(results)
142+ )
143+ if not r:
144+ raise TesseractError("detect_orientation failed",
145+ "TessBaseAPIDetectOS() failed")
146+ return {
147+ "orientation": results.best_orientation_id,
148+ "confidence": results.best_oconfidence,
149+ }
150diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py
151index 99b0121..658c96b 100755
152--- a/src/pyocr/tesseract.py
153+++ b/src/pyocr/tesseract.py
154@@ -22,6 +22,8 @@ import os
155 import subprocess
156 import sys
157 import tempfile
158+import contextlib
159+import shutil
160
161 from . import builders
162 from . import error
163@@ -62,9 +64,11 @@ class CharBoxBuilder(builders.BaseBuilder):
164
165 def __init__(self):
166 file_ext = ["box"]
167+ tess_flags = []
168 tess_conf = ["batch.nochop", "makebox"]
169 cun_args = []
170- super(CharBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
171+ super(CharBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
172+ cun_args)
173 self.tesseract_layout = 1
174
175 @staticmethod
176@@ -173,18 +177,19 @@ def detect_orientation(image, lang=None):
177 TesseractError --- if no script detected on the image
178 """
179 _set_environment()
180- with temp_file(".bmp") as input_file:
181- command = [TESSERACT_CMD, input_file.name, 'stdout', "-psm", "0"]
182+ with temp_dir() as tmpdir:
183+ command = [TESSERACT_CMD, "input.bmp", 'stdout', "-psm", "0"]
184 if lang is not None:
185 command += ['-l', lang]
186
187 if image.mode != "RGB":
188 image = image.convert("RGB")
189- image.save(input_file.name)
190+ image.save(os.path.join(tmpdir, "input.bmp"))
191
192 proc = subprocess.Popen(command, stdin=subprocess.PIPE, shell=False,
193 startupinfo=g_subprocess_startup_info,
194 creationflags=g_creation_flags,
195+ cwd=tmpdir,
196 stdout=subprocess.PIPE,
197 stderr=subprocess.STDOUT)
198 proc.stdin.close()
199@@ -224,8 +229,8 @@ def get_available_builders():
200 ]
201
202
203-def run_tesseract(input_filename, output_filename_base, lang=None,
204- configs=None):
205+def run_tesseract(input_filename, output_filename_base, cwd=None, lang=None,
206+ flags=None, configs=None):
207 '''
208 Runs Tesseract:
209 `TESSERACT_CMD` \
210@@ -238,6 +243,8 @@ def run_tesseract(input_filename, output_filename_base, lang=None,
211 input_filename --- image to read
212 output_filename_base --- file name in which must be stored the result
213 (without the extension)
214+ cwd --- Run Tesseract in the specified working directory or use current
215+ one if None
216 lang --- Tesseract language to use (if None, none will be specified)
217 config --- List of Tesseract configs to use (if None, none will be
218 specified)
219@@ -252,10 +259,13 @@ def run_tesseract(input_filename, output_filename_base, lang=None,
220 if lang is not None:
221 command += ['-l', lang]
222
223+ if flags is not None:
224+ command += flags
225+
226 if configs is not None:
227 command += configs
228
229- proc = subprocess.Popen(command,
230+ proc = subprocess.Popen(command, cwd=cwd,
231 startupinfo=g_subprocess_startup_info,
232 creationflags=g_creation_flags,
233 stdout=subprocess.PIPE,
234@@ -301,11 +311,18 @@ class ReOpenableTempfile(object):
235 self.name = None
236
237
238-def temp_file(suffix):
239- ''' Returns a temporary file '''
240- if os.name == 'nt': # Windows
241- return ReOpenableTempfile(suffix)
242- return tempfile.NamedTemporaryFile(prefix='tess_', suffix=suffix)
243+@contextlib.contextmanager
244+def temp_dir():
245+ """
246+ A context manager for maintaining a temporary directory
247+ """
248+ # NOTE: Drop this as soon as we don't support Python 2.7 anymore, because
249+ # since Python 3.2 there is a context manager called TemporaryDirectory().
250+ path = tempfile.mkdtemp(prefix='tess_')
251+ try:
252+ yield path
253+ finally:
254+ shutil.rmtree(path)
255
256
257 def image_to_string(image, lang=None, builder=None):
258@@ -329,23 +346,20 @@ def image_to_string(image, lang=None, builder=None):
259
260 if builder is None:
261 builder = builders.TextBuilder()
262- with temp_file(".bmp") as input_file:
263- with temp_file('') as output_file:
264- output_file_name_base = output_file.name
265-
266+ with temp_dir() as tmpdir:
267 if image.mode != "RGB":
268 image = image.convert("RGB")
269- image.save(input_file.name)
270- (status, errors) = run_tesseract(input_file.name,
271- output_file_name_base,
272+ image.save(os.path.join(tmpdir, "input.bmp"))
273+ (status, errors) = run_tesseract("input.bmp", "output", cwd=tmpdir,
274 lang=lang,
275+ flags=builder.tesseract_flags,
276 configs=builder.tesseract_configs)
277 if status:
278 raise TesseractError(status, errors)
279
280 output_file_name = "ERROR"
281 for file_extension in builder.file_extensions:
282- output_file_name = ('%s.%s' % (output_file_name_base,
283+ output_file_name = ('%s.%s' % (os.path.join(tmpdir, "output"),
284 file_extension))
285 if not os.access(output_file_name, os.F_OK):
286 continue
287diff --git a/tests/tests_libtesseract.py b/tests/tests_libtesseract.py
288index ccddd07..1ac2a4b 100644
289--- a/tests/tests_libtesseract.py
290+++ b/tests/tests_libtesseract.py
291@@ -33,8 +33,9 @@ class TestContext(unittest.TestCase):
292 (3, 3, 0),
293 (3, 4, 0),
294 (3, 4, 1),
295+ (3, 5, 0),
296 ), ("Tesseract does not have the expected version"
297- " (3.4.0) ! Some tests will be skipped !"))
298+ " (3.5.0) ! Some tests will be skipped !"))
299
300 def test_langs(self):
301 langs = libtesseract.get_available_languages()
302diff --git a/tests/tests_tesseract.py b/tests/tests_tesseract.py
303index e29c512..fa4d483 100644
304--- a/tests/tests_tesseract.py
305+++ b/tests/tests_tesseract.py
306@@ -27,8 +27,9 @@ class TestContext(unittest.TestCase):
307 (3, 3, 0),
308 (3, 4, 0),
309 (3, 4, 1),
310+ (3, 5, 0),
311 ), ("Tesseract does not have the expected version"
312- " (3.4.0) ! Some tests will be skipped !"))
313+ " (3.5.0) ! Some tests will be skipped !"))
314
315 def test_langs(self):
316 langs = tesseract.get_available_languages()