1commit cfc05af26b571e9ca09e9c709c0fb8934e9e46dd 2Author: Guillaume Girol <symphorien+git@xlumurb.eu> 3Date: Sat Aug 20 17:48:01 2022 +0200 4 5 Fix finding tesseract and cuneiform 6 7diff --git a/src/pyocr/cuneiform.py b/src/pyocr/cuneiform.py 8index 2e5b717..35647e2 100644 9--- a/src/pyocr/cuneiform.py 10+++ b/src/pyocr/cuneiform.py 11@@ -25,13 +25,9 @@ from . import builders 12 from .error import CuneiformError 13 14 15-# CHANGE THIS IF CUNEIFORM IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY 16-CUNEIFORM_CMD = 'cuneiform' 17+CUNEIFORM_CMD = '@cuneiform@/bin/cuneiform' 18 19-CUNEIFORM_DATA_POSSIBLE_PATHS = [ 20- "/usr/local/share/cuneiform", 21- "/usr/share/cuneiform", 22-] 23+CUNEIFORM_DATA_POSSIBLE_PATHS = ['@cuneiform@/share/cuneiform'] 24 25 LANGUAGES_LINE_PREFIX = "Supported languages: " 26 LANGUAGES_SPLIT_RE = re.compile("[^a-z]") 27diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py 28index 1edec8c..434a336 100644 29--- a/src/pyocr/libtesseract/tesseract_raw.py 30+++ b/src/pyocr/libtesseract/tesseract_raw.py 31@@ -2,7 +2,6 @@ import ctypes 32 import locale 33 import logging 34 import os 35-import sys 36 37 from ..error import TesseractError 38 39@@ -10,51 +9,16 @@ from ..error import TesseractError 40 logger = logging.getLogger(__name__) 41 42 TESSDATA_PREFIX = os.getenv('TESSDATA_PREFIX', None) 43-libnames = [] 44+if TESSDATA_PREFIX is None: 45+ TESSDATA_PREFIX = '@tesseract@/share/tessdata' 46+ os.environ['TESSDATA_PREFIX'] = TESSDATA_PREFIX 47+ 48+ 49 # 70 is the minimum credible dpi for tesseract and force it to compute an 50 # estimate of the image dpi 51 DPI_DEFAULT = 70 52 53- 54-if getattr(sys, 'frozen', False) and hasattr(sys, '_MEIPASS'): 55- # Pyinstaller integration 56- libnames += [os.path.join(sys._MEIPASS, "libtesseract-4.dll")] 57- libnames += [os.path.join(sys._MEIPASS, "libtesseract-3.dll")] 58- tessdata = os.path.join(sys._MEIPASS, "data") 59- if not os.path.exists(os.path.join(tessdata, "tessdata")): 60- logger.warning( 61- "Running from container, but no tessdata ({}) found !".format( 62- tessdata 63- ) 64- ) 65- else: 66- TESSDATA_PREFIX = os.path.join(tessdata, "tessdata") 67- 68- 69-if sys.platform[:3] == "win": # pragma: no cover 70- libnames += [ 71- # Jflesch> Don't they have the equivalent of LD_LIBRARY_PATH on 72- # Windows ? 73- "../vs2010/DLL_Release/libtesseract302.dll", 74- # prefer the most recent first 75- "libtesseract305.dll", 76- "libtesseract304.dll", 77- "libtesseract303.dll", 78- "libtesseract302.dll", 79- "libtesseract400.dll", # Tesseract 4 is still in alpha stage 80- "libtesseract.dll", 81- "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-4.dll", 82- "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-3.dll", 83- ] 84-else: 85- libnames += [ 86- "libtesseract.so.5", 87- "libtesseract.so.4", 88- "libtesseract.so.3", 89- "libtesseract.5.dylib", 90- "libtesseract.4.dylib", 91- ] 92- 93+libnames = [ "@tesseract@/lib/libtesseract.so" ] 94 95 g_libtesseract = None 96 97@@ -367,12 +331,12 @@ def init(lang=None): 98 try: 99 if lang: 100 lang = lang.encode("utf-8") 101- prefix = None 102- if TESSDATA_PREFIX: # pragma: no cover 103- prefix = TESSDATA_PREFIX.encode("utf-8") 104+ 105+ prefix = TESSDATA_PREFIX 106+ 107 g_libtesseract.TessBaseAPIInit3( 108 ctypes.c_void_p(handle), 109- ctypes.c_char_p(prefix), 110+ ctypes.c_char_p(prefix.encode('utf-8')), 111 ctypes.c_char_p(lang) 112 ) 113 g_libtesseract.TessBaseAPISetVariable( 114diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py 115index 0fe0d20..c1fdd27 100644 116--- a/src/pyocr/tesseract.py 117+++ b/src/pyocr/tesseract.py 118@@ -28,8 +28,7 @@ from .builders import DigitBuilder # backward compatibility 119 from .error import TesseractError # backward compatibility 120 from .util import digits_only 121 122-# CHANGE THIS IF TESSERACT IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY 123-TESSERACT_CMD = 'tesseract.exe' if os.name == 'nt' else 'tesseract' 124+TESSERACT_CMD = '@tesseract@/bin/tesseract' 125 126 TESSDATA_EXTENSION = ".traineddata" 127 128diff --git a/tests/test_cuneiform.py b/tests/test_cuneiform.py 129index b76e93c..266f6b2 100644 130--- a/tests/test_cuneiform.py 131+++ b/tests/test_cuneiform.py 132@@ -21,7 +21,7 @@ class TestCuneiform(BaseTest): 133 # XXX is it useful? 134 which.return_value = True 135 self.assertTrue(cuneiform.is_available()) 136- which.assert_called_once_with("cuneiform") 137+ which.assert_called_once_with("@cuneiform@/bin/cuneiform") 138 139 @patch("subprocess.Popen") 140 def test_version(self, popen): 141@@ -54,7 +54,7 @@ class TestCuneiform(BaseTest): 142 self.assertIn("eng", langs) 143 self.assertIn("fra", langs) 144 popen.assert_called_once_with( 145- ["cuneiform", "-l"], 146+ ["@cuneiform@/bin/cuneiform", "-l"], 147 stdout=subprocess.PIPE, stderr=subprocess.STDOUT 148 ) 149 150@@ -110,7 +110,7 @@ class TestCuneiformTxt(BaseTest): 151 output = cuneiform.image_to_string(self.image) 152 self.assertEqual(output, self._get_file_content("text").strip()) 153 popen.assert_called_once_with( 154- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"], 155+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"], 156 stdin=subprocess.PIPE, stdout=subprocess.PIPE, 157 stderr=subprocess.STDOUT 158 ) 159@@ -126,7 +126,7 @@ class TestCuneiformTxt(BaseTest): 160 builder=self.builder) 161 self.assertEqual(output, self._get_file_content("text").strip()) 162 popen.assert_called_once_with( 163- ["cuneiform", "-l", "fra", "-f", "text", "-o", self.tmp_filename, 164+ ["@cuneiform@/bin/cuneiform", "-l", "fra", "-f", "text", "-o", self.tmp_filename, 165 "-"], 166 stdin=subprocess.PIPE, stdout=subprocess.PIPE, 167 stderr=subprocess.STDOUT 168@@ -143,7 +143,7 @@ class TestCuneiformTxt(BaseTest): 169 builder=self.builder) 170 self.assertEqual(output, self._get_file_content("text").strip()) 171 popen.assert_called_once_with( 172- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"], 173+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"], 174 stdin=subprocess.PIPE, stdout=subprocess.PIPE, 175 stderr=subprocess.STDOUT 176 ) 177@@ -174,7 +174,7 @@ class TestCuneiformTxt(BaseTest): 178 output = cuneiform.image_to_string(image, builder=self.builder) 179 self.assertEqual(output, self._get_file_content("text").strip()) 180 popen.assert_called_once_with( 181- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"], 182+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"], 183 stdin=subprocess.PIPE, stdout=subprocess.PIPE, 184 stderr=subprocess.STDOUT 185 ) 186@@ -230,7 +230,7 @@ class TestCuneiformWordBox(BaseTest): 187 output = cuneiform.image_to_string(self.image, 188 builder=self.builder) 189 popen.assert_called_once_with( 190- ["cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"], 191+ ["@cuneiform@/bin/cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"], 192 stdin=subprocess.PIPE, stdout=subprocess.PIPE, 193 stderr=subprocess.STDOUT 194 ) 195@@ -284,7 +284,7 @@ class TestCuneiformLineBox(BaseTest): 196 output = cuneiform.image_to_string(self.image, 197 builder=self.builder) 198 popen.assert_called_once_with( 199- ["cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"], 200+ ["@cuneiform@/bin/cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"], 201 stdin=subprocess.PIPE, stdout=subprocess.PIPE, 202 stderr=subprocess.STDOUT 203 ) 204diff --git a/tests/test_libtesseract.py b/tests/test_libtesseract.py 205index cc31a50..890c02c 100644 206--- a/tests/test_libtesseract.py 207+++ b/tests/test_libtesseract.py 208@@ -167,7 +167,8 @@ class TestLibTesseractRaw(BaseTest): 209 args = libtess.TessBaseAPIInit3.call_args[0] 210 self.assertEqual(len(args), 3) 211 self.assertEqual(args[0].value, self.handle) 212- self.assertEqual(args[1].value, None) 213+ # we hardcode tesseract data, so we don't get None 214+ #self.assertEqual(args[1].value, None) 215 self.assertEqual(args[2].value, lang.encode() if lang else None) 216 217 self.assertEqual( 218@@ -203,7 +204,8 @@ class TestLibTesseractRaw(BaseTest): 219 args = libtess.TessBaseAPIInit3.call_args[0] 220 self.assertEqual(len(args), 3) 221 self.assertEqual(args[0].value, self.handle) 222- self.assertEqual(args[1].value, None) 223+ # we hardcode tesseract data, so we don't get None 224+ #self.assertEqual(args[1].value, None) 225 self.assertEqual(args[2].value, lang.encode() if lang else None) 226 227 self.assertEqual( 228diff --git a/tests/test_tesseract.py b/tests/test_tesseract.py 229index 823818f..2ee5fb4 100644 230--- a/tests/test_tesseract.py 231+++ b/tests/test_tesseract.py 232@@ -37,7 +37,7 @@ class TestTesseract(BaseTest): 233 def test_available(self, which): 234 which.return_value = True 235 self.assertTrue(tesseract.is_available()) 236- which.assert_called_once_with("tesseract") 237+ which.assert_called_once_with("@tesseract@/bin/tesseract") 238 239 @patch("subprocess.Popen") 240 def test_version_error(self, popen): 241@@ -163,7 +163,7 @@ class TestTesseract(BaseTest): 242 for lang in ("eng", "fra", "jpn", "osd"): 243 self.assertIn(lang, langs) 244 popen.assert_called_once_with( 245- ["tesseract", "--list-langs"], 246+ ["@tesseract@/bin/tesseract", "--list-langs"], 247 startupinfo=None, creationflags=0, 248 stdout=subprocess.PIPE, stderr=subprocess.STDOUT 249 ) 250@@ -178,7 +178,7 @@ class TestTesseract(BaseTest): 251 self.assertEqual(te.exception.status, 1) 252 self.assertEqual("unable to get languages", te.exception.message) 253 popen.assert_called_once_with( 254- ["tesseract", "--list-langs"], 255+ ["@tesseract@/bin/tesseract", "--list-langs"], 256 startupinfo=None, creationflags=0, 257 stdout=subprocess.PIPE, stderr=subprocess.STDOUT 258 ) 259@@ -255,7 +255,7 @@ class TestTesseract(BaseTest): 260 self.assertEqual(status, 0) 261 self.assertEqual(error, message) 262 popen.assert_called_once_with( 263- ["tesseract", "input.bmp", "output"], 264+ ["@tesseract@/bin/tesseract", "input.bmp", "output"], 265 cwd=tmpdir, 266 startupinfo=None, 267 creationflags=0, 268@@ -278,7 +278,7 @@ class TestTesseract(BaseTest): 269 self.assertEqual(status, 0) 270 self.assertEqual(error, message) 271 popen.assert_called_with( 272- ["tesseract", "input2.bmp", "output2", "-l", "fra", "--psm", "3"], 273+ ["@tesseract@/bin/tesseract", "input2.bmp", "output2", "-l", "fra", "--psm", "3"], 274 cwd=tmpdir, 275 startupinfo=None, 276 creationflags=0, 277@@ -309,7 +309,7 @@ class TestTesseract(BaseTest): 278 self.assertEqual(result["angle"], 90) 279 self.assertEqual(result["confidence"], 9.30) 280 popen.assert_called_once_with( 281- ["tesseract", "input.bmp", "stdout", "--psm", "0"], 282+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"], 283 stdin=subprocess.PIPE, 284 shell=False, 285 startupinfo=None, 286@@ -345,7 +345,7 @@ class TestTesseract(BaseTest): 287 self.assertEqual(result["angle"], 90) 288 self.assertEqual(result["confidence"], 9.30) 289 popen.assert_called_once_with( 290- ["tesseract", "input.bmp", "stdout", "--psm", "0"], 291+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"], 292 stdin=subprocess.PIPE, 293 shell=False, 294 startupinfo=None, 295@@ -378,7 +378,7 @@ class TestTesseract(BaseTest): 296 self.assertEqual(result["angle"], 90) 297 self.assertEqual(result["confidence"], 9.30) 298 popen.assert_called_once_with( 299- ["tesseract", "input.bmp", "stdout", 300+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", 301 "--psm", "0", "-l", "osd"], 302 stdin=subprocess.PIPE, 303 shell=False, 304@@ -406,7 +406,7 @@ class TestTesseract(BaseTest): 305 with self.assertRaises(tesseract.TesseractError) as te: 306 tesseract.detect_orientation(self.image) 307 popen.assert_called_once_with( 308- ["tesseract", "input.bmp", "stdout", "--psm", "0"], 309+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"], 310 stdin=subprocess.PIPE, 311 shell=False, 312 startupinfo=None, 313@@ -440,7 +440,7 @@ class TestTesseract(BaseTest): 314 with self.assertRaises(tesseract.TesseractError) as te: 315 tesseract.detect_orientation(self.image) 316 popen.assert_called_once_with( 317- ["tesseract", "input.bmp", "stdout", "--psm", "0"], 318+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"], 319 stdin=subprocess.PIPE, 320 shell=False, 321 startupinfo=None, 322@@ -474,7 +474,7 @@ class TestTesseract(BaseTest): 323 self.assertEqual(result["angle"], 90) 324 self.assertEqual(result["confidence"], 9.30) 325 popen.assert_called_once_with( 326- ["tesseract", "input.bmp", "stdout", "-psm", "0"], 327+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"], 328 stdin=subprocess.PIPE, 329 shell=False, 330 startupinfo=None, 331@@ -507,7 +507,7 @@ class TestTesseract(BaseTest): 332 self.assertEqual(result["angle"], 90) 333 self.assertEqual(result["confidence"], 9.30) 334 popen.assert_called_once_with( 335- ["tesseract", "input.bmp", "stdout", "-psm", "0", "-l", "fra"], 336+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0", "-l", "fra"], 337 stdin=subprocess.PIPE, 338 shell=False, 339 startupinfo=None, 340@@ -534,7 +534,7 @@ class TestTesseract(BaseTest): 341 with self.assertRaises(tesseract.TesseractError) as te: 342 tesseract.detect_orientation(self.image) 343 popen.assert_called_once_with( 344- ["tesseract", "input.bmp", "stdout", "-psm", "0"], 345+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"], 346 stdin=subprocess.PIPE, 347 shell=False, 348 startupinfo=None, 349@@ -568,7 +568,7 @@ class TestTesseract(BaseTest): 350 with self.assertRaises(tesseract.TesseractError) as te: 351 tesseract.detect_orientation(self.image) 352 popen.assert_called_once_with( 353- ["tesseract", "input.bmp", "stdout", "-psm", "0"], 354+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"], 355 stdin=subprocess.PIPE, 356 shell=False, 357 startupinfo=None,