-
Notifications
You must be signed in to change notification settings - Fork 23
/
parsecv.py
executable file
·177 lines (135 loc) · 5.14 KB
/
parsecv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import urllib2
from urlparse import urlparse
from referenceparser import referenceparser
from mendeleyparser import mendeleyparser
import lxml.html as ET
from utils import ratelimit, jsonify, get_view_rate_limit, get_url
from flask import Flask, request
from werkzeug.datastructures import FileStorage
from pdfminer.pdfdocument import PDFParser, PDFDocument
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# Module-level Flask application; the route below is registered on it.
# Its configuration is loaded from the object named 'settings'.
app = Flask(__name__)
app.config.from_object('settings')
def extract_resource_from_request():
    """
    Return the resource referenced by the current request's POST data.

    A non-empty form takes precedence: its "url" field is passed to
    get_url and the result is returned. Otherwise the uploaded "file"
    entry is returned.

    Raises ValueError when the request carries neither form fields nor
    files, or when the uploaded entry is not a FileStorage instance.
    """
    if not (request.form or request.files):
        raise ValueError("Received no data.")

    # Form data wins over uploads when both are present.
    if request.form:
        return get_url(request.form["url"])

    if isinstance(request.files["file"], FileStorage):
        return request.files["file"]
    raise ValueError("Invalid file type.")
def is_pdf(resource):
    """
    Determine whether a seekable file-like object holds PDF data.

    The first four characters/bytes are compared against the PDF magic
    number "%PDF". The stream is rewound to position 0 both before and
    after the check, so callers can read it from the start afterwards.

    resource -- object providing seek() and read()
    Returns True when the stream starts with "%PDF", False otherwise
    (including for an empty stream).
    """
    resource.seek(0)
    magic_number = resource.read(4)
    resource.seek(0)
    # Binary streams yield bytes and text streams yield text; compare
    # against a literal of the matching type so both kinds are recognised.
    if isinstance(magic_number, bytes):
        return magic_number == b"%PDF"
    return magic_number == u"%PDF"
def pdf_from_resource(resource):
    """
    Build a pdfminer document object from input data.

    A PDFParser is created over the resource, linked in both directions
    with a new PDFDocument, and the document is initialised before it is
    returned.
    """
    pdf_parser = PDFParser(resource)
    pdf_document = PDFDocument()

    # Parser and document each need a reference to the other.
    pdf_parser.set_document(pdf_document)
    pdf_document.set_parser(pdf_parser)

    pdf_document.initialize()
    return pdf_document
def pdf_to_text(pdf):
    """
    Convert a pdfminer PDFDocument to plaintext.

    pdf -- document object whose get_pages() yields the pages to process

    Returns a string: the text of every LTTextBox / LTTextLine layout
    element, concatenated in page order. An empty string is returned when
    no such elements are found.
    """
    # Create the PDFMiner objects for data extraction. Only the layout
    # aggregator is needed as the device, since its results are read back
    # after each page.
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Iterate over all pages, select text objects and collect their text;
    # joining once at the end avoids repeated string concatenation.
    chunks = []
    for page in pdf.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        chunks.extend(
            element.get_text()
            for element in layout
            if isinstance(element, (LTTextBox, LTTextLine))
        )
    return "".join(chunks)
def html_to_plaintext(resource):
    """
    Return the text content of an HTML document.

    resource -- buffer-like object whose getvalue() returns the HTML markup
    """
    return ET.fromstring(resource.getvalue()).text_content()
def parse_references(text):
    """Run the reference parser over plaintext and return its result."""
    parsed = referenceparser.parse_plaintext(text)
    return parsed
def is_mendeley_profile(url):
    """
    Tell whether a URL points at a Mendeley profile page.

    url -- URL string to inspect

    Returns True only when the host is mendeley.com or one of its
    subdomains and the path starts with "/profiles"; False otherwise,
    including for URLs without a host.
    """
    purl = urlparse(url)
    # hostname is lower-cased and excludes any userinfo or port; it is
    # None when the URL has no network location.
    host = purl.hostname or ""
    # Require an exact match or a dot-separated subdomain so look-alike
    # hosts such as "evilmendeley.com" are not accepted.
    on_mendeley = host == "mendeley.com" or host.endswith(".mendeley.com")
    return on_mendeley and purl.path.startswith("/profiles")
@app.route('/parsecv/', methods=['POST'])
#@ratelimit(limit=app.config["REQUESTS_PER_MINUTE"], per=60)
@jsonify
def parse_request():
    """
    Handle a POST to /parsecv/ and return the extracted references.

    Expected POST fields are:
    file -- an attached PDF file
    url -- full URL (Mendeley profile URLs go to the Mendeley parser,
           any other URL is fetched and treated as HTML)

    Form data takes precedence over an uploaded file. On failure a dict
    with "status" set to "error" and a "message" is returned instead of
    the references.
    """
    def failure(message):
        # Uniform error payload used by every failure path below.
        return {"status": "error", "message": message}

    try:
        if not (request.form or request.files):
            raise ValueError("Received no data.")

        if request.form:
            url = request.form["url"]
            if is_mendeley_profile(url):
                # The Mendeley parser's output is returned as-is, without
                # a further pass through the reference parser.
                return mendeleyparser.parse_mendeley_html(url)
            text = html_to_plaintext(get_url(url))
        else:
            upload = request.files["file"]
            if not is_pdf(upload):
                return failure("Unsupported file format.")
            try:
                text = pdf_to_text(pdf_from_resource(upload))
            except Exception as err:
                return failure(str(err))

        try:
            return parse_references(text)
        except Exception as err:
            return failure(str(err))
    except (ValueError, urllib2.HTTPError) as err:
        return failure(str(err))
#@app.after_request
#def inject_x_rate_headers(response):
# limit = get_view_rate_limit()
# if limit and limit.send_x_headers:
# h = response.headers
# h.add('X-RateLimit-Remaining', str(limit.remaining))
# h.add('X-RateLimit-Limit', str(limit.limit))
# h.add('X-RateLimit-Reset', str(limit.reset))
# return response
# Run the Flask application only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    app.run()