# 2. Extracting Text From Multiple Images: Method One

# Installation Instructions:
# Install the required Python libraries and system package:
# pip install pytesseract pillow pandas
# On Ubuntu/Debian: sudo apt install tesseract-ocr
# Ensure Python and these packages are installed before running the script.

import pytesseract
from PIL import Image
import pandas as pd

# pytesseract: A Python wrapper for Google's Tesseract-OCR Engine. It enables the recognition and reading of text within images.
# Pillow: A Python Imaging Library that supports opening, manipulating, and saving many different image file formats.
# pandas: A powerful data manipulation library that we will use to create and manipulate dataframes for storing OCR results.

# Function to extract text from multiple images and return a DataFrame
def extract_texts(image_paths):
    """Run OCR on each image and collect the results in a DataFrame.

    Args:
        image_paths: Iterable of image locations. Each item is passed to
            ``Image.open`` and stored unchanged in the ``filename`` column.

    Returns:
        A pandas DataFrame with one row per input image, in input order,
        and two columns: ``filename`` (the path as given) and ``text``
        (the string returned by ``pytesseract.image_to_string``).

    Any exception raised while opening or recognising an image propagates
    to the caller; no partial result is returned.
    """
    def _read_text(path):
        # The context manager closes the image file as soon as OCR is done.
        with Image.open(path) as picture:
            return pytesseract.image_to_string(picture)

    # Pair every path with its recognised text, one image at a time.
    rows = [(path, _read_text(path)) for path in image_paths]

    names = [name for name, _ in rows]
    contents = [content for _, content in rows]
    return pd.DataFrame({'filename': names, 'text': contents})

# Images to run through OCR, processed in the order listed.
image_paths = [
    "/content/badlayout1.1.png",
    "/content/badlayout4.1.png",
    "/content/badlayout4.5.png",
]

# OCR every listed image; the result has one row per image with
# 'filename' and 'text' columns.
df = extract_texts(image_paths)

# Show the results on standard output.
print(df)

# Persist the results next to the script. With pandas' default settings the
# row index is written as well, as the first (unnamed) CSV column.
df.to_csv('extracted_texts.csv')