From 4b71f9efadafdc029c412527cc633ddded4a877e Mon Sep 17 00:00:00 2001 From: Samuele Kaplun Date: Thu, 15 Jun 2017 14:56:52 +0200 Subject: [PATCH] workflows: add timeout to refextract task Times out the `refextract` task after 300 seconds to work around inspirehep/refextract#26, which would otherwise block a Celery worker indefinitely. Signed-off-by: Samuele Kaplun --- inspirehep/modules/workflows/tasks/actions.py | 16 ++++++++++------ inspirehep/modules/workflows/tasks/refextract.py | 3 +++ setup.py | 1 + 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/inspirehep/modules/workflows/tasks/actions.py b/inspirehep/modules/workflows/tasks/actions.py index 9fe2db3278..76efb28038 100644 --- a/inspirehep/modules/workflows/tasks/actions.py +++ b/inspirehep/modules/workflows/tasks/actions.py @@ -28,6 +28,7 @@ from flask import current_app from werkzeug import secure_filename +from timeout_decorator import TimeoutError from inspirehep.modules.workflows.utils import ( get_pdf_in_workflow, @@ -233,11 +234,14 @@ def _prepare_update_payload(obj, eng): def refextract(obj, eng): uri = get_pdf_in_workflow(obj) if uri: - mapped_references = extract_references(uri) - if mapped_references: - obj.data['references'] = mapped_references - obj.log.info('Extracted %d references', len(mapped_references)) - else: - obj.log.info('No references extracted') + try: + mapped_references = extract_references(uri) + if mapped_references: + obj.data['references'] = mapped_references + obj.log.info('Extracted %d references', len(mapped_references)) + else: + obj.log.info('No references extracted') + except TimeoutError: + obj.log.error('Timeout when extracting references from the PDF') else: obj.log.error('Not able to download and process the PDF') diff --git a/inspirehep/modules/workflows/tasks/refextract.py b/inspirehep/modules/workflows/tasks/refextract.py index f53b012f38..250185392e 100644 --- a/inspirehep/modules/workflows/tasks/refextract.py +++ b/inspirehep/modules/workflows/tasks/refextract.py @@ -26,6 +26,8 @@ import json +from timeout_decorator import timeout + from refextract import extract_journal_reference, extract_references_from_file from inspirehep.utils.helpers import maybe_int @@ -84,6 +86,7 @@ def extract_journal_info(obj, eng): obj.data["publication_info"] = new_publication_info +@timeout(5 * 60) def extract_references(filepath): """Extract references from PDF and return in INSPIRE format.""" references = extract_references_from_file( diff --git a/setup.py b/setup.py index f25af289af..715d875421 100644 --- a/setup.py +++ b/setup.py @@ -103,6 +103,7 @@ 'python-redis-lock~=3.2', 'backoff~=1.0,>=1.4.2', 'requests~=2.0,>=2.15.1', + 'timeout-decorator~=0.0,>=0.3.3', ] tests_require = [