-
Notifications
You must be signed in to change notification settings - Fork 0
/
ocr.py
48 lines (31 loc) · 1.11 KB
/
ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import platform
from pathlib import Path
from tempfile import TemporaryDirectory
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import re
# Windows installs do not put Tesseract/Poppler on PATH, so both locations are
# set explicitly. Elsewhere the tools are expected to be discoverable already
# and nothing is configured here.
if platform.system() == "Windows":
    # Raw strings already keep backslashes literally; the previous doubled
    # "\\" inside r"..." put two separators between every path component.
    pytesseract.pytesseract.tesseract_cmd = r"D:\Tesseract-OCR\tesseract.exe"
    # Directory holding the Poppler executables; passed to pdf2image as
    # poppler_path when converting on Windows.
    pe_path = Path(r"C:\Program Files\poppler-0.68.0\bin")
def main(list_compare, find_word, PDF_file):
    """Count matches of ``find_word`` in the OCR text of every page of a PDF.

    Each page of ``PDF_file`` is rendered at 500 DPI, written to a JPEG in a
    temporary directory, run through Tesseract, and searched with
    ``re.findall``. A hyphen immediately followed by a newline is removed
    first so that words split across a line break are matched as one word.

    Args:
        list_compare: Mutable list used as an out-parameter; the total match
            count over all pages is appended to it.
        find_word: Pattern to look for. It is used as a regular expression,
            so regex metacharacters keep their special meaning.
        PDF_file: Path of the PDF document to scan.

    Returns:
        The total number of matches (the same value appended to
        ``list_compare``).
    """
    # pdf2image is given an explicit Poppler location on Windows only.
    if platform.system() == "Windows":
        pdf_pages = convert_from_path(PDF_file, 500, poppler_path=pe_path)
    else:
        pdf_pages = convert_from_path(PDF_file, 500)

    # Build the pattern once rather than once per page.
    pattern = re.compile(str(find_word))
    total_matches = 0

    # The page images are only an intermediate for OCR, so they live in a
    # throwaway directory instead of a hard-coded drive path that may not
    # exist on the machine running this.
    with TemporaryDirectory() as work_dir:
        for page_number, page in enumerate(pdf_pages, start=1):
            image_path = str(Path(work_dir) / f"{page_number}.jpg")
            page.save(image_path, "JPEG")

            # Close the image handle before the directory is cleaned up;
            # an open handle blocks deletion on Windows.
            with Image.open(image_path) as page_image:
                text = str(pytesseract.image_to_string(page_image))

            # Re-join words hyphenated across a line break.
            text = text.replace("-\n", "")
            total_matches += len(pattern.findall(text))

    list_compare.append(total_matches)
    return total_matches
if __name__ == "__main__":
    # main() requires (list_compare, find_word, PDF_file); the previous bare
    # main() call raised TypeError, so the word and the PDF path are now read
    # from the command line.
    import argparse

    parser = argparse.ArgumentParser(
        description="Count occurrences of a pattern in the OCR text of a PDF."
    )
    parser.add_argument("find_word", help="regular expression to search for")
    parser.add_argument("pdf_file", help="path of the PDF to scan")
    cli_args = parser.parse_args()

    counts = []
    main(counts, cli_args.find_word, cli_args.pdf_file)
    # main() reports its result by appending to the list it was handed.
    print(counts[-1] if counts else 0)