-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathapp.py
88 lines (74 loc) · 3.53 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
from docx import Document
import streamlit as st
from utils import create_or_empty_dir, convert_pdf_to_images, create_docx_with_text
# Folder name (relative) used as scratch space for page images rendered
# from the PDFs.
extracted_images_dir = "extracted_images"

# Absolute path of the folder that contains this script; the working
# folders below are anchored to it.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Uploaded PDFs land in "uploads"; finished Word files in "converted_docx".
uploads_dir = os.path.join(current_dir, "uploads")
converted_docx_dir = os.path.join(current_dir, "converted_docx")

# Make sure both working folders exist (no error if they already do).
for _required_dir in (uploads_dir, converted_docx_dir):
    os.makedirs(_required_dir, exist_ok=True)
# Page header: title, a public-visibility warning, and short instructions.
st.title("Scanned PDF to Word Converter")
# Add warning message
st.warning(
    "⚠️ Important Notice: As you use this space, please be aware that any content you upload will be publicly accessible. We want to ensure your safety and security, so we kindly ask that you avoid sharing sensitive or private information. This includes personal data, confidential documents, or anything else you wouldn't want to be publicly visible. If you have any questions or concerns, don't hesitate to reach out."
)
st.write(
    "Upload a scanned PDF file with images etc to convert it to an editable Word document."
)

# File uploader component, restricted to PDF files.
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

# Check if a file was uploaded
if uploaded_file is not None:
    # The file name is supplied by the client, so keep only its final path
    # component; this prevents a crafted name (e.g. "../../x.pdf") from
    # writing outside the uploads directory.
    safe_name = os.path.basename(uploaded_file.name)
    # Save the uploaded file to the uploads directory
    with open(os.path.join(uploads_dir, safe_name), "wb") as file:
        file.write(uploaded_file.getbuffer())
    st.success("File uploaded successfully!")
else:
    st.info("Please upload a PDF file.")
# Get a list of all PDF files in the uploads directory. The extension check
# is case-insensitive so uploads such as "SCAN.PDF" are listed as well, and
# the names are sorted so the checkboxes keep a stable order between reruns.
pdf_files = sorted(
    name for name in os.listdir(uploads_dir) if name.lower().endswith(".pdf")
)

# Create a column layout: PDF selection and conversion on the left (col1),
# the second column (col2) is left for whatever follows this block.
col1, col2 = st.columns(2)

# Show checkboxes for each PDF file in col1
with col1:
    selected_files = []
    for pdf_name in pdf_files:
        if st.checkbox(pdf_name):
            selected_files.append(pdf_name)

    # The Convert button is only offered once at least one PDF is ticked.
    if selected_files and st.button("Convert"):
        # Address the scratch folder by one absolute path everywhere, so the
        # helper that writes the images and the helper that reads them agree
        # even when the working directory is not the script directory.
        image_folder = os.path.join(current_dir, extracted_images_dir)

        for pdf_name in selected_files:
            # Empty the scratch folder for every PDF, not just once up front.
            # NOTE(review): presumably create_docx_with_text consumes every
            # image in the folder; without this, images left by a previously
            # converted PDF would leak into the next document.
            print(f"Creating or emptying the {extracted_images_dir} directory")
            create_or_empty_dir(image_folder)

            # Convert the selected PDF file to images
            pdf_path = os.path.join(uploads_dir, pdf_name)
            print(f"Converting {pdf_name} to images in {extracted_images_dir}")
            convert_pdf_to_images(pdf_path, image_folder)

            # Derive the output name by dropping only the final extension
            # (str.replace would also strip ".pdf" from the middle of a name).
            docx_name = f"{os.path.splitext(pdf_name)[0]}.docx"
            output_docx = os.path.join(converted_docx_dir, docx_name)

            # Create a Word document with text extracted from images
            print(
                f"Creating {docx_name} with text extracted from images in the {extracted_images_dir}"
            )
            create_docx_with_text(image_folder, output_docx)

        st.success("Conversion completed successfully!")
# Show documents from the converted_docx folder in col2, one download button
# per .docx file (sorted so the buttons keep a stable order between reruns).
with col2:
    docx_files = sorted(
        name for name in os.listdir(converted_docx_dir) if name.endswith(".docx")
    )
    for docx_name in docx_files:
        # Read the bytes inside a context manager so the file handle is
        # closed immediately instead of being left for the garbage collector.
        with open(os.path.join(converted_docx_dir, docx_name), "rb") as docx_file:
            docx_bytes = docx_file.read()
        st.download_button(
            f"Download {docx_name}",
            docx_bytes,
            file_name=docx_name,
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )