Skip to content

Commit

Permalink
normalize journal titles: use hep request
Browse files Browse the repository at this point in the history
  • Loading branch information
MJedr committed Nov 10, 2023
1 parent 4604f3a commit 01d1484
Show file tree
Hide file tree
Showing 7 changed files with 523 additions and 260 deletions.
41 changes: 31 additions & 10 deletions inspirehep/modules/workflows/tasks/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@
delete_empty_key
)
from inspirehep.modules.workflows.utils.grobid_authors_parser import GrobidAuthors
from inspirehep.utils.normalizers import normalize_journal_title
from inspirehep.utils.url import is_pdf_link


Expand Down Expand Up @@ -611,6 +610,26 @@ def preserve_root(obj, eng):
obj.save()


@backoff.on_exception(backoff.expo, (BadGatewayError, requests.exceptions.ConnectionError), base=4, max_tries=5)
def _get_all_journal_titles_to_normalize(obj_data):
    """Fetch the normalized form of every journal title found in ``obj_data``.

    Journal titles are read from ``publication_info`` and from the
    ``publication_info`` of each reference, then sent in a single request to
    the ``normalize-journal-titles`` endpoint under ``INSPIREHEP_URL``.

    The call is retried with exponential backoff (at most 5 tries) when a
    ``BadGatewayError`` or a ``requests`` connection error is raised.

    Args:
        obj_data (dict): record data to read the journal titles from.

    Returns:
        The value stored under ``normalized_journal_titles`` in the JSON
        response.

    Raises:
        requests.exceptions.HTTPError: if the response has an error status.
    """
    # Titles of the record itself come first, followed by those of its references.
    titles = get_value(obj_data, 'publication_info.journal_title', []) + get_value(
        obj_data, 'references.reference.publication_info.journal_title', []
    )

    endpoint = "{inspirehep_url}/curation/literature/normalize-journal-titles".format(
        inspirehep_url=current_app.config["INSPIREHEP_URL"]
    )
    request_headers = _get_headers_for_hep_root_table_request()
    payload = json.dumps({'journal_titles_list': titles})

    response = requests.get(endpoint, headers=request_headers, data=payload)
    response.raise_for_status()
    return response.json()['normalized_journal_titles']


@with_debug_logging
def normalize_journal_titles(obj, eng):
"""Normalize the journal titles
Expand All @@ -632,27 +651,29 @@ def normalize_journal_titles(obj, eng):
Returns:
None
"""
normalized_journal_titles_mapping = _get_all_journal_titles_to_normalize(obj.data)
publications = obj.data.get('publication_info', [])

for publication in publications:
normalize_journal_title_entry(obj, publication, add_inspire_categories=True)
if 'journal_title' not in publication:
continue
normalized_journal_title = normalized_journal_titles_mapping[publication['journal_title']]
normalize_journal_title_entry(obj, publication, normalized_journal_title, add_inspire_categories=True)

references = obj.data.get("references", [])
for reference in references:
publication_info = get_value(reference, 'reference.publication_info')
if not publication_info:
publication_info = get_value(reference, 'reference.publication_info', {})
journal_title = publication_info.get('journal_title')
if not journal_title:
continue
normalize_journal_title_entry(obj, publication_info)
normalized_joutnal_title = normalized_journal_titles_mapping[journal_title]
normalize_journal_title_entry(obj, publication_info, normalized_joutnal_title)

if obj.extra_data.get('journal_inspire_categories'):
obj.extra_data['journal_inspire_categories'] = dedupe_list(obj.extra_data['journal_inspire_categories'])


def normalize_journal_title_entry(obj, publication_info, add_inspire_categories=False):
if 'journal_title' not in publication_info:
return

normalized_title = normalize_journal_title(publication_info['journal_title'])
def normalize_journal_title_entry(obj, publication_info, normalized_title, add_inspire_categories=False):
publication_info['journal_title'] = normalized_title

ref_query = RecordMetadata.query.filter(
Expand Down
7 changes: 7 additions & 0 deletions tests/integration/workflows/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,13 @@ def mocked_external_services(workflow_app):
headers=_get_headers_for_hep_root_table_request(),
status_code=200,
)
requests_mocker.register_uri(
"GET",
"http://web:8000/curation/literature/normalize-journal-titles",
json={"normalized_journal_titles": {}},
headers=_get_headers_for_hep_root_table_request(),
status_code=200,
)
requests_mocker.register_uri(
"POST",
"{}/extract_references_from_url".format(
Expand Down
15 changes: 11 additions & 4 deletions tests/integration/workflows/test_article_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,10 @@ def test_create_ticket_when_source_is_publishing(
ticket_publishing_content = "content=Queue%3A+HEP_publishing"
wf.continue_workflow()

assert ticket_publishing_content in mocked_external_services.request_history[4].text
assert ticket_publishing_content in mocked_external_services.request_history[5].text
assert wf.extra_data["curation_ticket_id"]
assert (
mocked_external_services.request_history[4].url
mocked_external_services.request_history[5].url
== "http://rt.inspire/ticket/new"
)

Expand Down Expand Up @@ -132,10 +132,10 @@ def test_create_ticket_when_source_is_not_publishing(
ticket_curation_content = "content=Queue%3A+HEP_curation"
wf.continue_workflow()

assert ticket_curation_content in mocked_external_services.request_history[4].text
assert ticket_curation_content in mocked_external_services.request_history[5].text
assert wf.extra_data["curation_ticket_id"]
assert (
mocked_external_services.request_history[4].url
mocked_external_services.request_history[5].url
== "http://rt.inspire/ticket/new"
)

Expand Down Expand Up @@ -174,6 +174,13 @@ def test_set_fermilab_collection_from_report_number(
headers=_get_headers_for_hep_root_table_request(),
status_code=200,
)
mock.register_uri(
"GET",
"http://web:8000/curation/literature/normalize-journal-titles",
json={"normalized_journal_titles": {}},
headers=_get_headers_for_hep_root_table_request(),
status_code=200,
)
mock.register_uri(
"GET",
"{inspirehep_url}/matcher/exact-match".format(
Expand Down
6 changes: 3 additions & 3 deletions tests/integration/workflows/test_arxiv_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -895,7 +895,7 @@ def test_conflict_creates_ticket(
wf_id=wf.id
)

assert mocked_external_services.request_history[1].text.startswith(
assert mocked_external_services.request_history[2].text.startswith(
expected_ticket
)
assert wf.extra_data["conflict-ticket-id"]
Expand All @@ -907,10 +907,10 @@ def test_conflict_creates_ticket(
wf.continue_workflow()

assert (
mocked_external_services.request_history[2].url == expected_ticket_close_url
mocked_external_services.request_history[3].url == expected_ticket_close_url
)
assert (
mocked_external_services.request_history[2].text
mocked_external_services.request_history[3].text
== "content=Status%3A+resolved"
)

Expand Down
Loading

0 comments on commit 01d1484

Please sign in to comment.