1commit cfc05af26b571e9ca09e9c709c0fb8934e9e46dd
2Author: Guillaume Girol <symphorien+git@xlumurb.eu>
3Date: Sat Aug 20 17:48:01 2022 +0200
4
5 Fix finding tesseract and cuneiform by hardcoding @tesseract@/@cuneiform@ paths
6
7diff --git a/src/pyocr/cuneiform.py b/src/pyocr/cuneiform.py
8index 2e5b717..35647e2 100644
9--- a/src/pyocr/cuneiform.py
10+++ b/src/pyocr/cuneiform.py
11@@ -25,13 +25,9 @@ from . import builders
12 from .error import CuneiformError
13
14
15-# CHANGE THIS IF CUNEIFORM IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY
16-CUNEIFORM_CMD = 'cuneiform'
17+CUNEIFORM_CMD = '@cuneiform@/bin/cuneiform'
18
19-CUNEIFORM_DATA_POSSIBLE_PATHS = [
20- "/usr/local/share/cuneiform",
21- "/usr/share/cuneiform",
22-]
23+CUNEIFORM_DATA_POSSIBLE_PATHS = ['@cuneiform@/share/cuneiform']
24
25 LANGUAGES_LINE_PREFIX = "Supported languages: "
26 LANGUAGES_SPLIT_RE = re.compile("[^a-z]")
27diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py
28index 1edec8c..434a336 100644
29--- a/src/pyocr/libtesseract/tesseract_raw.py
30+++ b/src/pyocr/libtesseract/tesseract_raw.py
31@@ -2,7 +2,6 @@ import ctypes
32 import locale
33 import logging
34 import os
35-import sys
36
37 from ..error import TesseractError
38
39@@ -10,51 +9,16 @@ from ..error import TesseractError
40 logger = logging.getLogger(__name__)
41
42 TESSDATA_PREFIX = os.getenv('TESSDATA_PREFIX', None)
43-libnames = []
44+if TESSDATA_PREFIX is None:
45+ TESSDATA_PREFIX = '@tesseract@/share/tessdata'
46+ os.environ['TESSDATA_PREFIX'] = TESSDATA_PREFIX
47+
48+
49 # 70 is the minimum credible dpi for tesseract and force it to compute an
50 # estimate of the image dpi
51 DPI_DEFAULT = 70
52
53-
54-if getattr(sys, 'frozen', False) and hasattr(sys, '_MEIPASS'):
55- # Pyinstaller integration
56- libnames += [os.path.join(sys._MEIPASS, "libtesseract-4.dll")]
57- libnames += [os.path.join(sys._MEIPASS, "libtesseract-3.dll")]
58- tessdata = os.path.join(sys._MEIPASS, "data")
59- if not os.path.exists(os.path.join(tessdata, "tessdata")):
60- logger.warning(
61- "Running from container, but no tessdata ({}) found !".format(
62- tessdata
63- )
64- )
65- else:
66- TESSDATA_PREFIX = os.path.join(tessdata, "tessdata")
67-
68-
69-if sys.platform[:3] == "win": # pragma: no cover
70- libnames += [
71- # Jflesch> Don't they have the equivalent of LD_LIBRARY_PATH on
72- # Windows ?
73- "../vs2010/DLL_Release/libtesseract302.dll",
74- # prefer the most recent first
75- "libtesseract305.dll",
76- "libtesseract304.dll",
77- "libtesseract303.dll",
78- "libtesseract302.dll",
79- "libtesseract400.dll", # Tesseract 4 is still in alpha stage
80- "libtesseract.dll",
81- "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-4.dll",
82- "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-3.dll",
83- ]
84-else:
85- libnames += [
86- "libtesseract.so.5",
87- "libtesseract.so.4",
88- "libtesseract.so.3",
89- "libtesseract.5.dylib",
90- "libtesseract.4.dylib",
91- ]
92-
93+libnames = [ "@tesseract@/lib/libtesseract.so" ]
94
95 g_libtesseract = None
96
97@@ -367,12 +331,12 @@ def init(lang=None):
98 try:
99 if lang:
100 lang = lang.encode("utf-8")
101- prefix = None
102- if TESSDATA_PREFIX: # pragma: no cover
103- prefix = TESSDATA_PREFIX.encode("utf-8")
104+
105+ prefix = TESSDATA_PREFIX
106+
107 g_libtesseract.TessBaseAPIInit3(
108 ctypes.c_void_p(handle),
109- ctypes.c_char_p(prefix),
110+ ctypes.c_char_p(prefix.encode('utf-8')),
111 ctypes.c_char_p(lang)
112 )
113 g_libtesseract.TessBaseAPISetVariable(
114diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py
115index 0fe0d20..c1fdd27 100644
116--- a/src/pyocr/tesseract.py
117+++ b/src/pyocr/tesseract.py
118@@ -28,8 +28,7 @@ from .builders import DigitBuilder # backward compatibility
119 from .error import TesseractError # backward compatibility
120 from .util import digits_only
121
122-# CHANGE THIS IF TESSERACT IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY
123-TESSERACT_CMD = 'tesseract.exe' if os.name == 'nt' else 'tesseract'
124+TESSERACT_CMD = '@tesseract@/bin/tesseract'
125
126 TESSDATA_EXTENSION = ".traineddata"
127
128diff --git a/tests/test_cuneiform.py b/tests/test_cuneiform.py
129index b76e93c..266f6b2 100644
130--- a/tests/test_cuneiform.py
131+++ b/tests/test_cuneiform.py
132@@ -21,7 +21,7 @@ class TestCuneiform(BaseTest):
133 # XXX is it useful?
134 which.return_value = True
135 self.assertTrue(cuneiform.is_available())
136- which.assert_called_once_with("cuneiform")
137+ which.assert_called_once_with("@cuneiform@/bin/cuneiform")
138
139 @patch("subprocess.Popen")
140 def test_version(self, popen):
141@@ -54,7 +54,7 @@ class TestCuneiform(BaseTest):
142 self.assertIn("eng", langs)
143 self.assertIn("fra", langs)
144 popen.assert_called_once_with(
145- ["cuneiform", "-l"],
146+ ["@cuneiform@/bin/cuneiform", "-l"],
147 stdout=subprocess.PIPE, stderr=subprocess.STDOUT
148 )
149
150@@ -110,7 +110,7 @@ class TestCuneiformTxt(BaseTest):
151 output = cuneiform.image_to_string(self.image)
152 self.assertEqual(output, self._get_file_content("text").strip())
153 popen.assert_called_once_with(
154- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
155+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
156 stdin=subprocess.PIPE, stdout=subprocess.PIPE,
157 stderr=subprocess.STDOUT
158 )
159@@ -126,7 +126,7 @@ class TestCuneiformTxt(BaseTest):
160 builder=self.builder)
161 self.assertEqual(output, self._get_file_content("text").strip())
162 popen.assert_called_once_with(
163- ["cuneiform", "-l", "fra", "-f", "text", "-o", self.tmp_filename,
164+ ["@cuneiform@/bin/cuneiform", "-l", "fra", "-f", "text", "-o", self.tmp_filename,
165 "-"],
166 stdin=subprocess.PIPE, stdout=subprocess.PIPE,
167 stderr=subprocess.STDOUT
168@@ -143,7 +143,7 @@ class TestCuneiformTxt(BaseTest):
169 builder=self.builder)
170 self.assertEqual(output, self._get_file_content("text").strip())
171 popen.assert_called_once_with(
172- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
173+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
174 stdin=subprocess.PIPE, stdout=subprocess.PIPE,
175 stderr=subprocess.STDOUT
176 )
177@@ -174,7 +174,7 @@ class TestCuneiformTxt(BaseTest):
178 output = cuneiform.image_to_string(image, builder=self.builder)
179 self.assertEqual(output, self._get_file_content("text").strip())
180 popen.assert_called_once_with(
181- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
182+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
183 stdin=subprocess.PIPE, stdout=subprocess.PIPE,
184 stderr=subprocess.STDOUT
185 )
186@@ -230,7 +230,7 @@ class TestCuneiformWordBox(BaseTest):
187 output = cuneiform.image_to_string(self.image,
188 builder=self.builder)
189 popen.assert_called_once_with(
190- ["cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
191+ ["@cuneiform@/bin/cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
192 stdin=subprocess.PIPE, stdout=subprocess.PIPE,
193 stderr=subprocess.STDOUT
194 )
195@@ -284,7 +284,7 @@ class TestCuneiformLineBox(BaseTest):
196 output = cuneiform.image_to_string(self.image,
197 builder=self.builder)
198 popen.assert_called_once_with(
199- ["cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
200+ ["@cuneiform@/bin/cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
201 stdin=subprocess.PIPE, stdout=subprocess.PIPE,
202 stderr=subprocess.STDOUT
203 )
204diff --git a/tests/test_libtesseract.py b/tests/test_libtesseract.py
205index cc31a50..890c02c 100644
206--- a/tests/test_libtesseract.py
207+++ b/tests/test_libtesseract.py
208@@ -167,7 +167,8 @@ class TestLibTesseractRaw(BaseTest):
209 args = libtess.TessBaseAPIInit3.call_args[0]
210 self.assertEqual(len(args), 3)
211 self.assertEqual(args[0].value, self.handle)
212- self.assertEqual(args[1].value, None)
213+ # TESSDATA_PREFIX now falls back to a hardcoded default, so args[1] is never None
214+ #self.assertEqual(args[1].value, None)
215 self.assertEqual(args[2].value, lang.encode() if lang else None)
216
217 self.assertEqual(
218@@ -203,7 +204,8 @@ class TestLibTesseractRaw(BaseTest):
219 args = libtess.TessBaseAPIInit3.call_args[0]
220 self.assertEqual(len(args), 3)
221 self.assertEqual(args[0].value, self.handle)
222- self.assertEqual(args[1].value, None)
223+ # TESSDATA_PREFIX now falls back to a hardcoded default, so args[1] is never None
224+ #self.assertEqual(args[1].value, None)
225 self.assertEqual(args[2].value, lang.encode() if lang else None)
226
227 self.assertEqual(
228diff --git a/tests/test_tesseract.py b/tests/test_tesseract.py
229index 823818f..2ee5fb4 100644
230--- a/tests/test_tesseract.py
231+++ b/tests/test_tesseract.py
232@@ -37,7 +37,7 @@ class TestTesseract(BaseTest):
233 def test_available(self, which):
234 which.return_value = True
235 self.assertTrue(tesseract.is_available())
236- which.assert_called_once_with("tesseract")
237+ which.assert_called_once_with("@tesseract@/bin/tesseract")
238
239 @patch("subprocess.Popen")
240 def test_version_error(self, popen):
241@@ -163,7 +163,7 @@ class TestTesseract(BaseTest):
242 for lang in ("eng", "fra", "jpn", "osd"):
243 self.assertIn(lang, langs)
244 popen.assert_called_once_with(
245- ["tesseract", "--list-langs"],
246+ ["@tesseract@/bin/tesseract", "--list-langs"],
247 startupinfo=None, creationflags=0,
248 stdout=subprocess.PIPE, stderr=subprocess.STDOUT
249 )
250@@ -178,7 +178,7 @@ class TestTesseract(BaseTest):
251 self.assertEqual(te.exception.status, 1)
252 self.assertEqual("unable to get languages", te.exception.message)
253 popen.assert_called_once_with(
254- ["tesseract", "--list-langs"],
255+ ["@tesseract@/bin/tesseract", "--list-langs"],
256 startupinfo=None, creationflags=0,
257 stdout=subprocess.PIPE, stderr=subprocess.STDOUT
258 )
259@@ -255,7 +255,7 @@ class TestTesseract(BaseTest):
260 self.assertEqual(status, 0)
261 self.assertEqual(error, message)
262 popen.assert_called_once_with(
263- ["tesseract", "input.bmp", "output"],
264+ ["@tesseract@/bin/tesseract", "input.bmp", "output"],
265 cwd=tmpdir,
266 startupinfo=None,
267 creationflags=0,
268@@ -278,7 +278,7 @@ class TestTesseract(BaseTest):
269 self.assertEqual(status, 0)
270 self.assertEqual(error, message)
271 popen.assert_called_with(
272- ["tesseract", "input2.bmp", "output2", "-l", "fra", "--psm", "3"],
273+ ["@tesseract@/bin/tesseract", "input2.bmp", "output2", "-l", "fra", "--psm", "3"],
274 cwd=tmpdir,
275 startupinfo=None,
276 creationflags=0,
277@@ -309,7 +309,7 @@ class TestTesseract(BaseTest):
278 self.assertEqual(result["angle"], 90)
279 self.assertEqual(result["confidence"], 9.30)
280 popen.assert_called_once_with(
281- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
282+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
283 stdin=subprocess.PIPE,
284 shell=False,
285 startupinfo=None,
286@@ -345,7 +345,7 @@ class TestTesseract(BaseTest):
287 self.assertEqual(result["angle"], 90)
288 self.assertEqual(result["confidence"], 9.30)
289 popen.assert_called_once_with(
290- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
291+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
292 stdin=subprocess.PIPE,
293 shell=False,
294 startupinfo=None,
295@@ -378,7 +378,7 @@ class TestTesseract(BaseTest):
296 self.assertEqual(result["angle"], 90)
297 self.assertEqual(result["confidence"], 9.30)
298 popen.assert_called_once_with(
299- ["tesseract", "input.bmp", "stdout",
300+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout",
301 "--psm", "0", "-l", "osd"],
302 stdin=subprocess.PIPE,
303 shell=False,
304@@ -406,7 +406,7 @@ class TestTesseract(BaseTest):
305 with self.assertRaises(tesseract.TesseractError) as te:
306 tesseract.detect_orientation(self.image)
307 popen.assert_called_once_with(
308- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
309+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
310 stdin=subprocess.PIPE,
311 shell=False,
312 startupinfo=None,
313@@ -440,7 +440,7 @@ class TestTesseract(BaseTest):
314 with self.assertRaises(tesseract.TesseractError) as te:
315 tesseract.detect_orientation(self.image)
316 popen.assert_called_once_with(
317- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
318+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
319 stdin=subprocess.PIPE,
320 shell=False,
321 startupinfo=None,
322@@ -474,7 +474,7 @@ class TestTesseract(BaseTest):
323 self.assertEqual(result["angle"], 90)
324 self.assertEqual(result["confidence"], 9.30)
325 popen.assert_called_once_with(
326- ["tesseract", "input.bmp", "stdout", "-psm", "0"],
327+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"],
328 stdin=subprocess.PIPE,
329 shell=False,
330 startupinfo=None,
331@@ -507,7 +507,7 @@ class TestTesseract(BaseTest):
332 self.assertEqual(result["angle"], 90)
333 self.assertEqual(result["confidence"], 9.30)
334 popen.assert_called_once_with(
335- ["tesseract", "input.bmp", "stdout", "-psm", "0", "-l", "fra"],
336+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0", "-l", "fra"],
337 stdin=subprocess.PIPE,
338 shell=False,
339 startupinfo=None,
340@@ -534,7 +534,7 @@ class TestTesseract(BaseTest):
341 with self.assertRaises(tesseract.TesseractError) as te:
342 tesseract.detect_orientation(self.image)
343 popen.assert_called_once_with(
344- ["tesseract", "input.bmp", "stdout", "-psm", "0"],
345+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"],
346 stdin=subprocess.PIPE,
347 shell=False,
348 startupinfo=None,
349@@ -568,7 +568,7 @@ class TestTesseract(BaseTest):
350 with self.assertRaises(tesseract.TesseractError) as te:
351 tesseract.detect_orientation(self.image)
352 popen.assert_called_once_with(
353- ["tesseract", "input.bmp", "stdout", "-psm", "0"],
354+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"],
355 stdin=subprocess.PIPE,
356 shell=False,
357 startupinfo=None,