Skip to content

Commit

Permalink
workflows: timeouts refextract
Browse files Browse the repository at this point in the history
Works around inspirehep/refextract#26 by interrupting the runaway
refextract process.

Signed-off-by: Samuele Kaplun <[email protected]>
  • Loading branch information
kaplun committed Jun 15, 2017
1 parent 3f6bdaa commit eb6013b
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 6 deletions.
16 changes: 10 additions & 6 deletions inspirehep/modules/workflows/tasks/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

from flask import current_app
from werkzeug import secure_filename
from timeout_decorator import TimeoutError

from inspirehep.modules.workflows.utils import (
get_pdf_in_workflow,
Expand Down Expand Up @@ -233,11 +234,14 @@ def _prepare_update_payload(obj, eng):
def refextract(obj, eng):
uri = get_pdf_in_workflow(obj)
if uri:
mapped_references = extract_references(uri)
if mapped_references:
obj.data['references'] = mapped_references
obj.log.info('Extracted %d references', len(mapped_references))
else:
obj.log.info('No references extracted')
try:
mapped_references = extract_references(uri)
if mapped_references:
obj.data['references'] = mapped_references
obj.log.info('Extracted %d references', len(mapped_references))
else:
obj.log.info('No references extracted')
except TimeoutError:
obj.log.error('Timeout when extracting references from the PDF')
else:
obj.log.error('Not able to download and process the PDF')
2 changes: 2 additions & 0 deletions inspirehep/modules/workflows/tasks/refextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from __future__ import absolute_import, division, print_function

import json
from timeout_decorator import timeout

from refextract import extract_journal_reference, extract_references_from_file

Expand Down Expand Up @@ -84,6 +85,7 @@ def extract_journal_info(obj, eng):
obj.data["publication_info"] = new_publication_info


@timeout(5 * 60)
def extract_references(filepath):
"""Extract references from PDF and return in INSPIRE format."""
references = extract_references_from_file(
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
'python-redis-lock~=3.2',
'backoff~=1.0,>=1.4.2',
'requests~=2.0,>=2.15.1',
'timeout-decorator~=0.0,>=0.3.3',
]

tests_require = [
Expand Down

0 comments on commit eb6013b

Please sign in to comment.