-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf_to_txt_argv.py
32 lines (28 loc) · 1013 Bytes
/
pdf_to_txt_argv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import os
import sys
from PyPDF2 import PdfReader
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``PdfReader``.

    Returns:
        One string holding each page's extracted text back to back, with
        no separator inserted between pages.
    """
    pdf_reader = PdfReader(pdf_path)
    # Join once instead of growing a string with ``+=`` per page. The
    # ``or ''`` is defensive: if a page ever yields None rather than a
    # string (presumably possible for pages without a text layer, depending
    # on the library version), it contributes nothing instead of raising.
    return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
def save_text_to_file(file_path, text):
    """Write *text* to *file_path*, replacing any existing content.

    Args:
        file_path: Destination path. It is opened in ``'w'`` mode, so an
            existing file is truncated.
        text: The string to write.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager closes the handle even when write() raises.
    # UTF-8 is requested explicitly so that non-ASCII characters in the
    # text do not depend on (or fail under) the platform default encoding.
    with open(file_path, 'w', encoding='utf-8') as text_file:
        text_file.write(text)
def process_pdf_files_in_dir(directory):
    """Convert each PDF directly inside *directory* to a sibling ``.txt`` file.

    For every regular file whose name ends in ``.pdf`` (compared
    case-insensitively), the text is extracted with
    ``extract_text_from_pdf`` and written by ``save_text_to_file`` to a file
    with the same base name and a ``.txt`` extension in the same directory.
    Subdirectories are not descended into.

    Args:
        directory: Path of the directory to scan.

    Raises:
        OSError: If *directory* cannot be listed.
    """
    # Sorted so the processing order (and the progress output) is
    # deterministic; os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(directory)):
        # Case-insensitive so "REPORT.PDF" is picked up as well.
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(directory, filename)
        # A directory that merely happens to be named "*.pdf" is not a PDF.
        if not os.path.isfile(pdf_path):
            continue
        print(f"Processing file: {filename}...")
        text = extract_text_from_pdf(pdf_path)
        txt_filename = f"{os.path.splitext(filename)[0]}.txt"
        txt_path = os.path.join(directory, txt_filename)
        save_text_to_file(txt_path, text)
if __name__ == "__main__":
    # Exactly one positional argument is expected: the directory to scan.
    if len(sys.argv) != 2:
        print("Usage: python pdf_to_txt_argv.py <directory>")
        sys.exit(1)
    directory = sys.argv[1]
    # Reject a bad path up front with a readable message rather than letting
    # os.listdir fail with a traceback further down.
    if not os.path.isdir(directory):
        print(f"Error: '{directory}' is not a directory", file=sys.stderr)
        sys.exit(1)
    process_pdf_files_in_dir(directory)