forked from aws-samples/amazon-textract-textractor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtextractor.py
164 lines (133 loc) · 5.45 KB
/
textractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import sys
import os
from urllib.parse import urlparse
import boto3
import time
from tdp import DocumentProcessor
from og import OutputGenerator
from helper import FileHelper, S3Helper
class Textractor:
    """Command-line driver that textracts one or more documents.

    The command line is parsed into a settings dictionary, the list of
    documents (a single file, a local folder, or an S3 object/prefix) is
    resolved, and each document is passed through ``DocumentProcessor`` and
    ``OutputGenerator``.
    """

    # Options that consume the next command-line argument as their value,
    # mapped to the key they are stored under.
    _VALUE_OPTIONS = {
        '--documents': 'documents',
        '--region': 'region',
        '--translate': 'translate',
    }

    # Boolean switches, mapped to the key that is set to True when present.
    _FLAG_OPTIONS = {
        '--text': 'text',
        '--forms': 'forms',
        '--tables': 'tables',
        '--insights': 'insights',
        '--medical-insights': 'medical-insights',
    }

    # Seconds to wait between two consecutive documents.
    # NOTE(review): presumably a guard against service throttling - confirm.
    _PAUSE_BETWEEN_DOCUMENTS = 20

    def getInputParameters(self, args):
        """Parse command-line arguments into an ``event`` dictionary.

        Args:
            args: Sequence of command-line tokens (e.g. ``sys.argv``); may be
                ``None`` or empty. Unrecognized tokens, including the script
                name, are ignored.

        Returns:
            dict: ``documents``, ``region`` and ``translate`` hold the token
            following their option; ``text``, ``forms``, ``tables``,
            ``insights`` and ``medical-insights`` are set to ``True`` when the
            corresponding switch is present. Absent options have no key.

        Raises:
            ValueError: If a value-taking option is the last token, so no
                value follows it.
        """
        event = {}
        if not args:
            return event

        i = 0
        while i < len(args):
            arg = args[i]
            if arg in self._VALUE_OPTIONS:
                if i + 1 >= len(args):
                    raise ValueError("Option {} requires a value.".format(arg))
                event[self._VALUE_OPTIONS[arg]] = args[i + 1]
                # Skip the value so it is never re-interpreted as an option.
                i += 1
            elif arg in self._FLAG_OPTIONS:
                event[self._FLAG_OPTIONS[arg]] = True
            i += 1
        return event

    def validateInput(self, args):
        """Validate the command line and resolve the documents to process.

        Args:
            args: Sequence of command-line tokens, as accepted by
                ``getInputParameters``.

        Returns:
            dict: Settings with the keys ``bucketName`` (``None`` for local
            input), ``documents``, ``awsRegion``, ``text``, ``forms``,
            ``tables``, ``insights``, ``medical-insights`` (booleans) and
            ``translate`` (the requested value, or ``""`` when not given).

        Raises:
            Exception: If ``--documents`` was not supplied.
            ValueError: Propagated from ``getInputParameters`` when an option
                is missing its value.
        """
        event = self.getInputParameters(args)

        if 'documents' not in event:
            raise Exception("Document or path to a foler or S3 bucket containing documents is required.")

        inputDocument = event['documents']
        # Lower-cased copy used only for the prefix/suffix checks; the
        # original casing is kept for bucket names, keys and file paths.
        idl = inputDocument.lower()
        isFolder = idl.endswith("/")

        bucketName = None
        documents = []
        awsRegion = 'us-east-1'

        if idl.startswith("s3://"):
            o = urlparse(inputDocument)
            bucketName = o.netloc
            # Drop the leading "/" of the URL path.
            path = o.path[1:]
            ar = S3Helper.getS3BucketRegion(bucketName)
            if ar:
                awsRegion = ar

            if isFolder:
                allowedFileTypes = ["jpg", "jpeg", "png", "pdf"]
                # NOTE(review): the meaning of the literal 1 is defined by
                # S3Helper.getFileNames, which is not visible here - confirm.
                documents = S3Helper.getFileNames(awsRegion, bucketName, path, 1, allowedFileTypes)
            else:
                documents.append(path)
        elif isFolder:
            # Local folders are filtered to image types only (no "pdf").
            allowedFileTypes = ["jpg", "jpeg", "png"]
            documents = FileHelper.getFileNames(inputDocument, allowedFileTypes)
        else:
            documents.append(inputDocument)

        # An explicit --region wins over the default and the bucket's region.
        if 'region' in event:
            awsRegion = event['region']

        return {
            "bucketName": bucketName,
            "documents": documents,
            "awsRegion": awsRegion,
            "text": 'text' in event,
            "forms": 'forms' in event,
            "tables": 'tables' in event,
            "insights": 'insights' in event,
            "medical-insights": 'medical-insights' in event,
            "translate": event.get("translate", ""),
        }

    def processDocument(self, ips, i, document):
        """Textract a single document and generate its output.

        Args:
            ips: Settings dictionary as returned by ``validateInput``.
            i: 1-based position of the document, used only for display.
            document: Local path or S3 key of the document.
        """
        print("\nTextracting Document # {}: {}".format(i, document))
        print('=' * (len(document) + 30))

        # Get document textracted
        dp = DocumentProcessor(ips["bucketName"], document, ips["awsRegion"],
                               ips["text"], ips["forms"], ips["tables"])
        response = dp.run()
        print("Recieved Textract response...")

        # Generate output files
        print("Generating output...")
        name, ext = FileHelper.getFileNameAndExtension(document)
        opg = OutputGenerator(response,
                              "{}-{}".format(name, ext),
                              ips["forms"], ips["tables"])
        opg.run()

        # Insight generation is only requested when at least one of the
        # related options was given.
        if ips["insights"] or ips["medical-insights"] or ips["translate"]:
            opg.generateInsights(ips["insights"], ips["medical-insights"],
                                 ips["translate"], ips["awsRegion"])

        print("{} textracted successfully.".format(document))

    def printFormatException(self, e):
        """Print the validation error ``e`` followed by usage examples."""
        print("Invalid input: {}".format(e))
        print("Valid format:")
        print('- python3 textractor.py --documents mydoc.jpg --text --forms --tables --region us-east-1')
        print('- python3 textractor.py --documents ./myfolder/ --text --forms --tables')
        # The option is spelled "--documents"; the parser does not recognize
        # "--document".
        print('- python3 textractor.py --documents s3://mybucket/mydoc.pdf --text --forms --tables')
        print('- python3 textractor.py --documents s3://mybucket/ --text --forms --tables')

    def run(self):
        """Process every document named on the command line (``sys.argv``).

        When the command line is invalid, the error and usage examples are
        printed and the method returns without processing anything.
        """
        try:
            ips = self.validateInput(sys.argv)
        except Exception as e:
            self.printFormatException(e)
            # Without valid settings there is nothing to process.
            return

        documents = ips["documents"]
        totalDocuments = len(documents)

        print("\n")
        print('*' * 60)
        print("Total input documents: {}".format(totalDocuments))
        print('*' * 60)

        for i, document in enumerate(documents, start=1):
            self.processDocument(ips, i, document)

            remaining = totalDocuments - i
            if remaining > 0:
                print("\nRemaining documents: {}".format(remaining))
                print("\nTaking a short break...")
                time.sleep(self._PAUSE_BETWEEN_DOCUMENTS)
                print("Allright, ready to go...\n")

        print("\n")
        print('*' * 60)
        print("Successfully textracted documents: {}".format(totalDocuments))
        print('*' * 60)
        print("\n")
# Start processing only when this file is executed as a script, so that
# importing the module does not trigger a run.
if __name__ == "__main__":
    Textractor().run()