Skip to content

Commit

Permalink
workflows: fix errors with already downloaded documents
Browse files Browse the repository at this point in the history
  • Loading branch information
drjova committed Nov 13, 2024
1 parent 8df5366 commit 34558d0
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 1 deletion.
7 changes: 6 additions & 1 deletion inspirehep/modules/workflows/tasks/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,12 +431,17 @@ def download_documents(obj, eng):
LOGGER.info('Downloading documents for %s', obj.id)
documents = obj.data.get('documents', [])
for document in documents:
filename = document['key']
url = document['url']
if url.startswith('/api/files'): # this is a local file, no need to download
obj.log.info('Document already downloaded from %s', url)
continue

filename = document['key']
scheme = urlparse(url).scheme
LOGGER.info(
'Downloading document key:%s url:%s scheme:%s', document['key'], document['url'], scheme
)

if scheme == 'file':
downloaded = copy_file_to_workflow(obj, filename, url)
else:
Expand Down
44 changes: 44 additions & 0 deletions tests/unit/workflows/test_workflows_tasks_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,50 @@ def test_download_documents():
assert expected_document_url == documents[0]['url']


def test_regression_download_documents_with_local_file_should_not_fail():
    """Regression test: ``download_documents`` must cope with local files.

    The record carries two documents: one remote (served here by the mocked
    arXiv endpoint) and one whose ``url`` already points at ``/api/files/``.
    The task is expected to return ``None``, rewrite the URL of the remote
    document and leave the already-local one exactly as it was.
    """
    with requests_mock.Mocker() as requests_mocker:
        # Only the remote document is registered; the local one must never
        # trigger an HTTP request.
        requests_mocker.register_uri(
            'GET', 'http://export.arxiv.org/pdf/1605.03844',
            content=pkg_resources.resource_string(
                __name__, os.path.join('fixtures', '1605.03844.pdf')),
        )

        schema = load_schema('hep')
        subschema = schema['properties']['documents']

        data = {
            'documents': [
                {
                    'key': '1605.03844.pdf',
                    'url': 'http://export.arxiv.org/pdf/1605.03844'
                },
                {
                    'original_url': 'http://export.arxiv.org/pdf/2308.04775',
                    'key': '2308.04775.pdf',
                    'url': '/api/files/c04581d8-b8b1-4b4e-9819-b17c63517ee7/2308.04775.pdf'
                }
            ],
        }  # literature/1458302
        extra_data = {}
        files = MockFiles({})
        assert validate(data['documents'], subschema) is None

        obj = MockObj(data, extra_data, files=files)
        eng = MockEng()

        assert download_documents(obj, eng) is None

        documents = obj.data['documents']
        expected_document_urls = [
            '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/1605.03844.pdf',
            '/api/files/c04581d8-b8b1-4b4e-9819-b17c63517ee7/2308.04775.pdf'
        ]

        assert 2 == len(documents)
        # Check every document, not just the first: the second entry is the
        # already-local file this regression test exists for. (The previous
        # assertion referenced an undefined name and raised NameError.)
        assert expected_document_urls == [
            document['url'] for document in documents
        ]



def test_download_documents_with_multiple_documents():
with requests_mock.Mocker() as requests_mocker:
requests_mocker.register_uri(
Expand Down

0 comments on commit 34558d0

Please sign in to comment.