generated from streamlit/blank-app-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
streamlit_app.py
102 lines (84 loc) · 3.59 KB
/
streamlit_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import streamlit as st
import pytesseract
from PIL import Image
import pandas as pd
import io
# Function for OCR processing
def extract_text_from_image(image):
    """
    Run pytesseract OCR over an image and hand back the recognized text.

    Args:
        image (PIL.Image): The image whose text content should be read.

    Returns:
        str: Whatever ``pytesseract.image_to_string`` produces for the image.
    """
    return pytesseract.image_to_string(image)
# Function to map entities to columns based on best match
def map_entities_to_columns(entities, columns):
    """
    Assign each detected entity to a table column in round-robin order.

    The entity at position ``i`` is paired with ``columns[i % len(columns)]``,
    so columns are reused cyclically when there are more entities than
    columns. This is a positional placeholder for demo purposes, not a
    content-based match.

    Args:
        entities (list): List of detected entities (strings), in order.
        columns (list): List of column names from the table structure.

    Returns:
        dict: Mapping of entity -> column name. Because the result is keyed
        by the entity string, a repeated entity keeps only the column of its
        last occurrence. The mapping is empty when ``entities`` or
        ``columns`` is empty.
    """
    column_count = len(columns)
    if column_count == 0:
        # Nothing to assign to; bail out instead of hitting a modulo by zero.
        return {}
    return {
        entity: columns[position % column_count]
        for position, entity in enumerate(entities)
    }
# Example table structures
# Supported document types, each mapped to the ordered column names of its
# target table. The keys are offered to the user as the table-type choices.
TABLE_STRUCTURES = {
    "Receipts": [
        "Item",
        "Price",
        "Quantity",
        "Date",
        "Total",
    ],
    "Invoices": [
        "Invoice No.",
        "Date",
        "Customer",
        "Amount",
        "Due Date",
    ],
}
# Streamlit App
st.title("Document Recognition and Table Mapping App")

# Step 1: Select table structure (Receipts/Invoices)
table_type = st.selectbox("Select Table Type", list(TABLE_STRUCTURES.keys()))
columns = TABLE_STRUCTURES[table_type]
st.write(f"Selected Table Structure: {columns}")

# Step 2: Upload or capture an image
image_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

# Everything below needs an image, so it only runs once a file is uploaded.
if image_file is not None:
    # Load the image
    image = Image.open(image_file)
    # NOTE(review): `use_column_width` is reported as deprecated in recent
    # Streamlit releases -- confirm against the Streamlit version in use.
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Step 3: Perform OCR to extract text from the image
    st.write("Extracting text from image...")
    extracted_text = extract_text_from_image(image)
    st.write("Detected Entities:")
    st.text(extracted_text)

    # Treat every non-blank OCR line as one entity (a simplification).
    detected_entities = [
        line for line in extracted_text.splitlines() if line.strip()
    ]
    st.write("Detected Entities List:")
    st.write(detected_entities)

    # Step 4: Auto-map entities to columns
    st.write("Automatically Mapping Detected Entities to Table Columns...")
    entity_column_mapping = map_entities_to_columns(detected_entities, columns)
    st.write(entity_column_mapping)

    # Step 5: Manual adjustment of entity to column mapping
    st.write("Manually Adjust Entity to Column Mapping (if needed):")
    # Snapshot the suggested columns before the loop mutates the mapping, so
    # one widget's selection never changes another widget's default.
    default_columns = [entity_column_mapping[entity] for entity in detected_entities]
    # One (entity, column) pair per OCR line. Unlike the entity-keyed dict,
    # this keeps repeated lines as separate entries.
    assignments = []
    for position, entity in enumerate(detected_entities):
        selected_column = st.selectbox(
            f"Select column for entity '{entity}'",
            columns,
            index=columns.index(default_columns[position]),
            # Identical OCR lines would otherwise create widgets with the
            # same auto-generated ID, which Streamlit rejects.
            key=f"entity_column_{position}",
        )
        entity_column_mapping[entity] = selected_column
        assignments.append((entity, selected_column))

    # Display final mapping (keyed by entity text, so a repeated entity shows
    # only its last selection here; the preview below keeps every line).
    st.write("Final Mapping of Entities to Columns:")
    st.write(entity_column_mapping)

    # Step 6: Preview the table with the mapped data
    st.write("Preview Mapped Data Table:")
    preview_data = {column: [] for column in columns}
    for entity, column in assignments:
        preview_data[column].append(entity)

    # Pad shorter columns with None so every column has the same length,
    # which pandas requires when building a DataFrame from a dict of lists.
    max_len = max(len(values) for values in preview_data.values())
    for values in preview_data.values():
        values.extend([None] * (max_len - len(values)))

    # Create DataFrame to display
    df = pd.DataFrame(preview_data)
    st.write(df)