# 2.ExtractingTextFromMultipleImagesMethodOne
# Installation Instructions:
# Install the required Python libraries and system package:
# pip install pytesseract pillow pandas
# On Ubuntu/Debian: sudo apt install tesseract-ocr
# Ensure Python and these packages are installed before running the script.
import pytesseract
from PIL import Image
import pandas as pd
# pytesseract: A Python wrapper for Google's Tesseract-OCR Engine. It enables the recognition and reading of text within images.
# Pillow: The maintained fork of the Python Imaging Library (PIL); imported as `PIL`, it supports opening, manipulating, and saving many different image file formats.
# pandas: A powerful data manipulation library that we will use to create and manipulate dataframes for storing OCR results.
# Function to extract text from multiple images and return a DataFrame
def extract_texts(image_paths) -> pd.DataFrame:
    """Run Tesseract OCR on each image and collect the results in a DataFrame.

    Args:
        image_paths: Iterable of image file paths. Each one is passed to
            ``Image.open`` and is consumed once, in order.

    Returns:
        A DataFrame with one row per input path, in input order, and two
        columns: ``filename`` (the path exactly as it was passed in) and
        ``text`` (the value returned by ``pytesseract.image_to_string``).
        An empty input yields an empty DataFrame that still has both columns.

    Raises:
        Whatever ``Image.open`` or ``pytesseract.image_to_string`` raises;
        nothing is caught here, so the first failing image aborts the run.
    """
    filenames = []
    extracted_texts = []
    for image_path in image_paths:
        # The context manager closes the image file once OCR has finished,
        # so handles do not accumulate across a long list of images.
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(img)
        # Append to both lists together so they always stay the same length.
        filenames.append(image_path)
        extracted_texts.append(text)
    return pd.DataFrame({
        'filename': filenames,
        'text': extracted_texts,
    })
# Images to run through OCR, processed in the order listed
image_paths = [
    "/content/badlayout1.1.png",
    "/content/badlayout4.1.png",
    "/content/badlayout4.5.png",
]

# OCR every image; the result has one row per path
df = extract_texts(image_paths=image_paths)

# Print the table, then write it to a CSV file in the current working directory
print(df)
df.to_csv(path_or_buf='extracted_texts.csv')