Skip to content

Commit

Permalink
workflows: add timeout to refextract task
Browse files (browse the repository at this point in the history)
Times out the `refextract` task after 300 seconds to work around
inspirehep/refextract#26, which would otherwise block a Celery
worker indefinitely.

Signed-off-by: Samuele Kaplun <[email protected]>
  • Loading branch information
kaplun authored and jacquerie committed Jun 15, 2017
1 parent 3f6bdaa commit 4b71f9e
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 6 deletions.
16 changes: 10 additions & 6 deletions inspirehep/modules/workflows/tasks/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

from flask import current_app
from werkzeug import secure_filename
from timeout_decorator import TimeoutError

from inspirehep.modules.workflows.utils import (
get_pdf_in_workflow,
Expand Down Expand Up @@ -233,11 +234,14 @@ def _prepare_update_payload(obj, eng):
def refextract(obj, eng):
uri = get_pdf_in_workflow(obj)
if uri:
mapped_references = extract_references(uri)
if mapped_references:
obj.data['references'] = mapped_references
obj.log.info('Extracted %d references', len(mapped_references))
else:
obj.log.info('No references extracted')
try:
mapped_references = extract_references(uri)
if mapped_references:
obj.data['references'] = mapped_references
obj.log.info('Extracted %d references', len(mapped_references))
else:
obj.log.info('No references extracted')
except TimeoutError:
obj.log.error('Timeout when extracting references from the PDF')
else:
obj.log.error('Not able to download and process the PDF')
3 changes: 3 additions & 0 deletions inspirehep/modules/workflows/tasks/refextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@

import json

from timeout_decorator import timeout

from refextract import extract_journal_reference, extract_references_from_file

from inspirehep.utils.helpers import maybe_int
Expand Down Expand Up @@ -84,6 +86,7 @@ def extract_journal_info(obj, eng):
obj.data["publication_info"] = new_publication_info


@timeout(5 * 60)
def extract_references(filepath):
"""Extract references from PDF and return in INSPIRE format."""
references = extract_references_from_file(
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
'python-redis-lock~=3.2',
'backoff~=1.0,>=1.4.2',
'requests~=2.0,>=2.15.1',
'timeout-decorator~=0.0,>=0.3.3',
]

tests_require = [
Expand Down

0 comments on commit 4b71f9e

Please sign in to comment.