-
Notifications
You must be signed in to change notification settings - Fork 0
/
pytesser.py
114 lines (101 loc) · 4.31 KB
/
pytesser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""OCR in Python using the Tesseract engine from Google
http://code.google.com/p/pytesser/
by Michael J.T. O'Kelly
V 0.0.2, 5/26/08"""
import Image
import subprocess
import os
import StringIO
import util
import errors
# Name of the executable that is called at the command line.
tesseract_exe_name = "dlltest"

# Scratch image file; must be .bmp or another Tesseract-compatible format.
scratch_image_name = "temp.bmp"

# Root name of the scratch text file; the .txt extension is left out.
scratch_text_name_root = "temp"

# When True, temporary files are cleaned up after an OCR operation.
_cleanup_scratch_flag = True

# Default language; Tesseract uses English if no language is given.
_language = ""

# Default page segmentation mode; Tesseract uses fully automatic page
# segmentation if psm is not given (psm is available in v3.01).
_pagesegmode = ""

# Working directory recorded once, when this module is first imported.
_working_dir = os.getcwd()
def call_tesseract(input_filename, output_filename, language, pagesegmode):
    """Run the external Tesseract executable on an image file.

    The command is launched from ``_working_dir``; the caller's working
    directory is restored afterwards, even if an exception is raised.

    Args:
        input_filename: Image file handed to the executable (it must be in
            a format the executable accepts).
        output_filename: Root of the output file name, passed without the
            ``.txt`` extension.
        language: Language code sent with ``-l``. An empty string or None
            omits the option.
        pagesegmode: Page segmentation mode sent with ``-psm``. An empty
            string or None omits the option.

    Raises:
        Whatever ``errors.check_for_errors`` raises; it is only consulted
        when the process exits with a non-zero return code.
    """
    current_dir = os.getcwd()
    try:
        os.chdir(_working_dir)
        args = [tesseract_exe_name, input_filename, output_filename]
        if language:
            args.extend(["-l", language])
        # Compare the textual form so that mode 0 is still passed through,
        # while None is treated as "not given" instead of becoming "None".
        psm_text = "" if pagesegmode is None else str(pagesegmode)
        if psm_text:
            args.extend(["-psm", psm_text])
        # Capture stderr through a pipe so the error text handed to
        # errors.check_for_errors is what the process actually reported.
        try:
            proc = subprocess.Popen(args, stderr=subprocess.PIPE)
        except (TypeError, AttributeError):
            proc = subprocess.Popen(args, shell=True, stderr=subprocess.PIPE)
        # communicate() drains the pipe while waiting, so a chatty process
        # cannot block on a full stderr buffer.
        _, error_text = proc.communicate()
        if proc.returncode != 0:
            errors.check_for_errors(error_stream_text=error_text or "")
    finally:  # Guarantee that we return to the original directory
        os.chdir(current_dir)
def image_to_string(im, lang=_language, psm=_pagesegmode, cleanup=_cleanup_scratch_flag):
    """Write im to the scratch image file, run tesseract on it and return
    the text that was produced.

    lang and psm are handed to call_tesseract unchanged. If cleanup is
    true, the scratch files are removed once the operation has finished,
    whether it succeeded or raised."""
    try:
        util.image_to_scratch(im, scratch_image_name)
        call_tesseract(scratch_image_name, scratch_text_name_root, lang, psm)
        # The result is read before the finally clause removes the files.
        return util.retrieve_result(scratch_text_name_root)
    finally:
        if cleanup:
            util.perform_cleanup(scratch_image_name, scratch_text_name_root)
def image_file_to_string(filename, lang = _language, psm = _pagesegmode, cleanup = _cleanup_scratch_flag, graceful_errors=True):
    """Applies tesseract to filename; or, if image is incompatible and graceful_errors=True,
    converts to compatible format and then applies tesseract. Fetches resulting text.

    If cleanup=True, delete scratch files after operation. Parameter lang specifies used language.
    If lang is empty, English is used. Page segmentation mode parameter psm is available in Tesseract 3.01.
    The same lang, psm and cleanup values are used for the fallback conversion path.

    If graceful_errors=False, errors.Tesser_General_Exception raised by the direct
    attempt is re-raised to the caller.

    psm values are:
    0 = Orientation and script detection (OSD) only.
    1 = Automatic page segmentation with OSD.
    2 = Automatic page segmentation, but no OSD, or OCR
    3 = Fully automatic page segmentation, but no OSD. (Default)
    4 = Assume a single column of text of variable sizes.
    5 = Assume a single uniform block of vertically aligned text.
    6 = Assume a single uniform block of text.
    7 = Treat the image as a single text line.
    8 = Treat the image as a single word.
    9 = Treat the image as a single word in a circle.
    10 = Treat the image as a single character."""
    try:
        try:
            call_tesseract(filename, scratch_text_name_root, lang, psm)
            result = util.retrieve_result(scratch_text_name_root)
        except errors.Tesser_General_Exception:
            if not graceful_errors:
                raise
            # Fall back to opening the file and going through the scratch
            # image. Arguments are passed by keyword so that cleanup cannot
            # land in the lang slot and lang/psm are not silently dropped.
            im = Image.open(filename)
            result = image_to_string(im, lang=lang, psm=psm, cleanup=cleanup)
    finally:
        if cleanup:
            util.perform_cleanup(scratch_image_name, scratch_text_name_root)
    return result
if __name__=='__main__':
im = Image.open('phototest.tif')
text = image_to_string(im, cleanup=False)
print text
text = image_to_string(im, psm=2, cleanup=False)
print text
try:
text = image_file_to_string('fnord.tif', graceful_errors=False)
except errors.Tesser_General_Exception, value:
print "fnord.tif is incompatible filetype. Try graceful_errors=True"
#print value
text = image_file_to_string('fnord.tif', graceful_errors=True, cleanup=False)
print "fnord.tif contents:", text
text = image_file_to_string('fonts_test.png', graceful_errors=True)
print text
text = image_file_to_string('fonts_test.png', lang="eng", psm=4, graceful_errors=True)
print text