# 3. Extracting Text from Multiple Images (Method Two)

# Installation Instructions:
# Install the required Python libraries and system package:
# pip install pytesseract pillow pandas
# On Ubuntu/Debian: sudo apt install tesseract-ocr
# Ensure Python and these packages are installed before running the script.

import pytesseract
from PIL import Image
import pandas as pd
import os

# pytesseract: A Python wrapper for Google's Tesseract-OCR Engine.
# Pillow: A Python Imaging Library that supports opening, manipulating, and saving many image formats.
# pandas: A library for data manipulation and analysis.
# os: A module that provides a portable way of using operating system-dependent functionality.

# Function to extract text from multiple images and compile results into a DataFrame
def extract_texts(image_paths):
    """Run OCR on each image and collect the results in a DataFrame.

    Args:
        image_paths: Iterable of image locations accepted by ``Image.open``.
            Each image is opened, passed to ``pytesseract.image_to_string``,
            and closed again before the next one is processed.

    Returns:
        A pandas DataFrame with one row per input, in input order, and two
        columns: ``'filename'`` (the path exactly as it was passed in, not
        just the base name) and ``'text'`` (the string returned by
        Tesseract for that image).

    Any exception raised while opening or recognising an image propagates
    to the caller; no partial result is returned.
    """
    def _read_text(path):
        # The context manager closes the image as soon as OCR has finished.
        with Image.open(path) as picture:
            return pytesseract.image_to_string(picture)

    # Pair each path with its recognised text, processing images one by one.
    rows = [(path, _read_text(path)) for path in image_paths]

    return pd.DataFrame({
        'filename': [path for path, _ in rows],
        'text': [ocr_text for _, ocr_text in rows],
    })

# Specify the directory containing images
directory_path = '/path/to/your/image/directory'

# File extensions (lowercase) that are treated as images
image_extensions = ('.png', '.jpg', '.jpeg')

# List all image files in the directory.
# - The listing is sorted because os.listdir() returns entries in arbitrary
#   order; sorting keeps the row order of the output reproducible.
# - The name is lowercased before the extension check so that files such as
#   'SCAN.JPG' or 'page.Png' are not silently skipped.
image_paths = [
    os.path.join(directory_path, file)
    for file in sorted(os.listdir(directory_path))
    if file.lower().endswith(image_extensions)
]

# Explanation of the list comprehension:
# directory_path: Path to the directory where your images are stored.
# os.listdir(directory_path): Lists all entries in the specified directory.
# sorted(...): Puts the entries in a stable, alphabetical order.
# os.path.join(directory_path, file): Joins the directory path with the file name to create a full path.
# file.lower().endswith(image_extensions): Keeps only names ending in .png, .jpg, or .jpeg, ignoring case.
# The result is stored in image_paths, which contains the paths to all image files in the directory.

# Run OCR over every collected image; the result has one row per image
df = extract_texts(image_paths)

# Preview the top of the table (DataFrame.head() defaults to five rows)
preview = df.head()
print(preview)

# Write the complete table to a CSV file in the current working directory
output_csv = 'extracted_texts_final.csv'
df.to_csv(output_csv)