# 3.ExtractingTextFromMultipleImagesMethodTwo
# Installation Instructions:
# Install the required Python libraries and system package:
# pip install pytesseract pillow pandas
# On Ubuntu/Debian: sudo apt install tesseract-ocr
# Ensure Python and these packages are installed before running the script.
import pytesseract
from PIL import Image
import pandas as pd
import os
# pytesseract: A Python wrapper for Google's Tesseract-OCR Engine.
# Pillow: A Python Imaging Library that supports opening, manipulating, and saving many image formats.
# pandas: A library for data manipulation and analysis.
# os: A module that provides a portable way of using operating system-dependent functionality.
# Function to extract text from multiple images and compile results into a DataFrame
def extract_texts(image_paths, *, lang=None, config=''):
    """Run Tesseract OCR over several images and tabulate the results.

    Args:
        image_paths: Iterable of image file paths. Each one is opened with
            ``PIL.Image.open`` and passed to Tesseract.
        lang: Optional Tesseract language code(s) (for example ``'eng'``),
            forwarded to ``pytesseract.image_to_string``. ``None`` leaves
            the language selection to pytesseract/Tesseract.
        config: Extra Tesseract command-line flags (for example
            ``'--psm 6'``), forwarded unchanged. Empty by default.

    Returns:
        A ``pandas.DataFrame`` with one row per input path, in input order,
        and two columns: ``filename`` (the path exactly as it was passed in)
        and ``text`` (the OCR output for that image). An empty input yields
        an empty DataFrame that still has both columns.

    Raises:
        Any exception raised by ``Image.open`` or
        ``pytesseract.image_to_string``. Nothing is caught here, so a single
        unreadable image aborts the whole batch.
    """
    filenames = []
    extracted_texts = []
    for image_path in image_paths:
        # The context manager closes the image file as soon as OCR for this
        # image is finished, instead of keeping every file open until the
        # end of the batch.
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(img, lang=lang, config=config)
        filenames.append(image_path)
        extracted_texts.append(text)
    return pd.DataFrame({
        'filename': filenames,
        'text': extracted_texts,
    })
# Specify the directory containing images
directory_path = '/path/to/your/image/directory'
# File extensions (lower-case) that are treated as images
image_extensions = ('.png', '.jpg', '.jpeg')
# List all image files in the directory
image_paths = [
    os.path.join(directory_path, file)
    for file in sorted(os.listdir(directory_path))
    if file.lower().endswith(image_extensions)
]
# Explanation of the list comprehension:
# directory_path: Path to the directory where your images are stored.
# os.listdir(directory_path): Lists all entries in the specified directory, in arbitrary order.
# sorted(...): Puts the names in a stable order so the rows of the result are the same on every run.
# os.path.join(directory_path, file): Joins the directory path with the file name to create a full path.
# file.lower().endswith(image_extensions): Keeps only names with an image extension,
#   ignoring case so that files such as 'SCAN.PNG' or 'photo.JPG' are not skipped.
# The result is stored in image_paths, which contains the paths to all image files in the directory.
# Extract text from the images and create a DataFrame
df = extract_texts(image_paths)
# Display the first few rows of the DataFrame
print(df.head())
# Save the DataFrame to a CSV file (the DataFrame index is written as the first, unnamed column)
df.to_csv('extracted_texts_final.csv')