diff --git a/inspirehep/modules/workflows/tasks/actions.py b/inspirehep/modules/workflows/tasks/actions.py index 59e097a724..e39743b820 100644 --- a/inspirehep/modules/workflows/tasks/actions.py +++ b/inspirehep/modules/workflows/tasks/actions.py @@ -92,7 +92,6 @@ delete_empty_key ) from inspirehep.modules.workflows.utils.grobid_authors_parser import GrobidAuthors -from inspirehep.utils.normalizers import normalize_journal_title from inspirehep.utils.url import is_pdf_link @@ -611,6 +610,26 @@ def preserve_root(obj, eng): obj.save() +@backoff.on_exception(backoff.expo, (BadGatewayError, requests.exceptions.ConnectionError), base=4, max_tries=5) +def _get_all_journal_titles_to_normalize(obj_data): + """Get all journal titles to normalize.""" + + publication_journal_titles = get_value(obj_data, 'publication_info.journal_title', []) + references_journal_titles = get_value(obj_data, 'references.reference.publication_info.journal_title', []) + all_titles_to_normalize = publication_journal_titles + references_journal_titles + + response = requests.get( + "{inspirehep_url}/curation/literature/normalize-journal-titles".format( + inspirehep_url=current_app.config["INSPIREHEP_URL"] + ), + headers=_get_headers_for_hep_root_table_request(), + data=json.dumps({'journal_titles_list': all_titles_to_normalize}) + ) + response.raise_for_status() + normalized_journal_titles_mapping = response.json()['normalized_journal_titles'] + return normalized_journal_titles_mapping + + @with_debug_logging def normalize_journal_titles(obj, eng): """Normalize the journal titles @@ -632,27 +651,29 @@ def normalize_journal_titles(obj, eng): Returns: None """ + normalized_journal_titles_mapping = _get_all_journal_titles_to_normalize(obj.data) publications = obj.data.get('publication_info', []) for publication in publications: - normalize_journal_title_entry(obj, publication, add_inspire_categories=True) + if 'journal_title' not in publication: + continue + normalized_journal_title = normalized_journal_titles_mapping[publication['journal_title']] + normalize_journal_title_entry(obj, publication, normalized_journal_title, add_inspire_categories=True) references = obj.data.get("references", []) for reference in references: - publication_info = get_value(reference, 'reference.publication_info') - if not publication_info: + publication_info = get_value(reference, 'reference.publication_info', {}) + journal_title = publication_info.get('journal_title') + if not journal_title: continue - normalize_journal_title_entry(obj, publication_info) + normalized_joutnal_title = normalized_journal_titles_mapping[journal_title] + normalize_journal_title_entry(obj, publication_info, normalized_joutnal_title) if obj.extra_data.get('journal_inspire_categories'): obj.extra_data['journal_inspire_categories'] = dedupe_list(obj.extra_data['journal_inspire_categories']) -def normalize_journal_title_entry(obj, publication_info, add_inspire_categories=False): - if 'journal_title' not in publication_info: - return - - normalized_title = normalize_journal_title(publication_info['journal_title']) +def normalize_journal_title_entry(obj, publication_info, normalized_title, add_inspire_categories=False): publication_info['journal_title'] = normalized_title ref_query = RecordMetadata.query.filter( diff --git a/tests/integration/workflows/conftest.py b/tests/integration/workflows/conftest.py index dbd6849565..9676961ded 100644 --- a/tests/integration/workflows/conftest.py +++ b/tests/integration/workflows/conftest.py @@ -290,6 +290,13 @@ def mocked_external_services(workflow_app): headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + requests_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) requests_mocker.register_uri( "POST", "{}/extract_references_from_url".format( diff --git a/tests/integration/workflows/test_article_workflow.py b/tests/integration/workflows/test_article_workflow.py index 1c43f494ab..251bee90af 100644 --- a/tests/integration/workflows/test_article_workflow.py +++ b/tests/integration/workflows/test_article_workflow.py @@ -98,10 +98,10 @@ def test_create_ticket_when_source_is_publishing( ticket_publishing_content = "content=Queue%3A+HEP_publishing" wf.continue_workflow() - assert ticket_publishing_content in mocked_external_services.request_history[4].text + assert ticket_publishing_content in mocked_external_services.request_history[5].text assert wf.extra_data["curation_ticket_id"] assert ( - mocked_external_services.request_history[4].url + mocked_external_services.request_history[5].url == "http://rt.inspire/ticket/new" ) @@ -132,10 +132,10 @@ def test_create_ticket_when_source_is_not_publishing( ticket_curation_content = "content=Queue%3A+HEP_curation" wf.continue_workflow() - assert ticket_curation_content in mocked_external_services.request_history[4].text + assert ticket_curation_content in mocked_external_services.request_history[5].text assert wf.extra_data["curation_ticket_id"] assert ( - mocked_external_services.request_history[4].url + mocked_external_services.request_history[5].url == "http://rt.inspire/ticket/new" ) @@ -174,6 +174,13 @@ def test_set_fermilab_collection_from_report_number( headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + mock.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) mock.register_uri( "GET", "{inspirehep_url}/matcher/exact-match".format( diff --git a/tests/integration/workflows/test_arxiv_merge.py b/tests/integration/workflows/test_arxiv_merge.py index d95c735b01..8ea23b5374 100644 --- a/tests/integration/workflows/test_arxiv_merge.py +++ b/tests/integration/workflows/test_arxiv_merge.py @@ -895,7 +895,7 @@ def test_conflict_creates_ticket( wf_id=wf.id ) - assert mocked_external_services.request_history[1].text.startswith( + assert mocked_external_services.request_history[2].text.startswith( expected_ticket ) assert wf.extra_data["conflict-ticket-id"] @@ -907,10 +907,10 @@ def test_conflict_creates_ticket( wf.continue_workflow() assert ( - mocked_external_services.request_history[2].url == expected_ticket_close_url + mocked_external_services.request_history[3].url == expected_ticket_close_url ) assert ( - mocked_external_services.request_history[2].text + mocked_external_services.request_history[3].text == "content=Status%3A+resolved" ) diff --git a/tests/integration/workflows/test_arxiv_workflow.py b/tests/integration/workflows/test_arxiv_workflow.py index 20a6f15919..b8384c41d6 100644 --- a/tests/integration/workflows/test_arxiv_workflow.py +++ b/tests/integration/workflows/test_arxiv_workflow.py @@ -461,6 +461,13 @@ def test_article_workflow_continues_when_record_is_valid( headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + requests_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) workflow_id = build_workflow(valid_record).id eng_uuid = start("article", object_id=workflow_id) @@ -514,6 +521,13 @@ def test_update_exact_matched_goes_trough_the_workflow( headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + requests_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) requests_mocker.register_uri( "GET", "{inspirehep_url}/matcher/fuzzy-match".format( @@ -611,6 +625,13 @@ def custom_continue_workflow(self, *args, **kwargs): headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + requests_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) workflow_id = build_workflow(record).id eng_uuid = start("article", object_id=workflow_id) @@ -672,6 +693,13 @@ def test_validation_error_callback_with_a_valid( headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + requests_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) workflow_id = build_workflow(valid_record).id eng_uuid = start("article", object_id=workflow_id) @@ -728,6 +756,13 @@ def test_validation_error_callback_with_validation_error( headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + requests_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) workflow_id = build_workflow(invalid_record).id @@ -789,6 +824,13 @@ def test_validation_error_callback_with_missing_worfklow( headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + requests_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) workflow_id = build_workflow(invalid_record).id eng_uuid = start("article", object_id=workflow_id) @@ -833,6 +875,14 @@ def test_validation_error_callback_with_malformed_with_invalid_types( headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + requests_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) + requests_mocker.register_uri( "GET", "{inspirehep_url}/matcher/fuzzy-match".format( @@ -1155,6 +1205,14 @@ def test_workflows_halts_on_multiple_exact_matches(workflow_app): headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + requests_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) + requests_mocker.register_uri( "GET", "{inspirehep_url}/matcher/fuzzy-match".format( @@ -1444,6 +1502,14 @@ def test_update_record_goes_through_api_version_of_store_record_without_issue( headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + mocked_external_services.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) + mocked_external_services.register_uri( "GET", "http://web:8000/curation/literature/affiliations-normalization", @@ -1538,6 +1604,13 @@ def test_update_record_goes_through_api_version_of_store_record_wrong_api_addres headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + mocked_external_services.register_uri( + "GET", + "{inspirehep_url}/curation/literature/normalize-journal-titles".format(inspirehep_url=workflow_app.config["INSPIREHEP_URL"]), + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) mocked_external_services.register_uri( "GET", "http://web:8000/curation/literature/affiliations-normalization", @@ -1618,6 +1691,16 @@ def test_update_record_goes_through_api_version_of_store_record_connection_timeo headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + mocked_external_services.register_uri( + "GET", + "{inspirehep_url}/curation/literature/normalize-journal-titles".format( + inspirehep_url=workflow_app.config["INSPIREHEP_URL"] + ), + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) + mocked_external_services.register_uri( "GET", "{inspirehep_url}/matcher/exact-match".format( @@ -1636,6 +1719,13 @@ def test_update_record_goes_through_api_version_of_store_record_connection_timeo headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + mocked_external_services.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) mocked_external_services.register_uri( "GET", "{inspirehep_url}/curation/literature/affiliations-normalization".format( @@ -1723,9 +1813,15 @@ def test_workflow_checks_affiliations_if_record_is_not_important( ): workflow_id = build_workflow(record).id start("article", object_id=workflow_id) - collections_in_record = filter( - lambda x: x.path == '/literature', - mocked_external_services.request_history).pop().json().get('_collections') + collections_in_record = ( + filter( + lambda x: x.path == "/literature", + mocked_external_services.request_history, + ) + .pop() + .json() + .get("_collections") + ) assert "CDS Hidden" in collections_in_record assert "HAL Hidden" in collections_in_record assert "Fermilab" in collections_in_record @@ -1780,10 +1876,14 @@ def test_workflow_do_not_changes_to_hidden_if_record_authors_do_not_have_interes wf.save() wf.continue_workflow(delayed=False) - collections_in_record = filter( - lambda x: x.path == '/literature', - mocked_external_services.request_history - ).pop().json().get('_collections') + collections_in_record = ( + filter( + lambda x: x.path == "/literature", mocked_external_services.request_history + ) + .pop() + .json() + .get("_collections") + ) assert "CDS Hidden" not in collections_in_record assert "HAL Hidden" not in collections_in_record @@ -1853,6 +1953,13 @@ def test_workflow_checks_affiliations_if_record_is_rejected_by_curator( headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + mocked_external_services.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) mocked_external_services.register_uri( "GET", "{inspirehep_url}/matcher/fuzzy-match".format( @@ -1877,10 +1984,14 @@ def test_workflow_checks_affiliations_if_record_is_rejected_by_curator( wf.save() wf.continue_workflow(delayed=False) - collections_in_record = filter( - lambda x: x.path == '/literature', - mocked_external_services.request_history - ).pop().json().get('_collections') + collections_in_record = ( + filter( + lambda x: x.path == "/literature", mocked_external_services.request_history + ) + .pop() + .json() + .get("_collections") + ) assert "CDS Hidden" in collections_in_record assert "HAL Hidden" in collections_in_record @@ -1944,7 +2055,7 @@ def test_grobid_extracts_authors_correctly( { u"raw_affiliations": [ { - "value": u"Department of Mathematics and Statistics, University of Prince Edward Island, 550 University Avenue, Charlottetown, PEI, Canada C1A 4P3." + u"value": u"Department of Mathematics and Statistics, University of Prince Edward Island, 550 University Avenue, Charlottetown, PEI, Canada C1A 4P3." } ], u"emails": [u"nsaad@upei.ca"], diff --git a/tests/integration/workflows/test_workflow_core_selection.py b/tests/integration/workflows/test_workflow_core_selection.py index 73b5301b2d..d539f51ce2 100644 --- a/tests/integration/workflows/test_workflow_core_selection.py +++ b/tests/integration/workflows/test_workflow_core_selection.py @@ -100,6 +100,13 @@ def test_core_selection_wf_starts_after_article_wf_when_no_core( headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + mock.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) mock.register_uri( "GET", "http://web:8000/curation/literature/affiliations-normalization", @@ -190,9 +197,9 @@ def test_core_selection_wf_starts_after_article_wf_when_no_core( expected_record_data = load_json_record("hep_record_no_core.json")["metadata"] expected_record_data["core"] = True - assert len(mock.request_history) == 7 + assert len(mock.request_history) == 8 # Check is record sent to HEP is correct (only core has changed) - assert mock.request_history[6].json() == expected_record_data + assert mock.request_history[7].json() == expected_record_data @mock.patch("inspirehep.modules.workflows.tasks.submission.send_robotupload") @@ -257,6 +264,13 @@ def test_core_selection_wf_is_not_created_when_wf_is_record_update( "revision_id": 3, }, ) + mock.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) mock.register_uri( "GET", "http://web:8000/curation/literature/collaborations-normalization", @@ -371,6 +385,13 @@ def test_core_selection_wf_works_when_there_is_record_redirection_on_hep( "revision_id": 3, }, ) + mock.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) mock.register_uri( "GET", "http://web:8000/curation/literature/collaborations-normalization", @@ -465,9 +486,9 @@ def test_core_selection_wf_works_when_there_is_record_redirection_on_hep( expected_record_data = load_json_record("hep_record_no_core.json")["metadata"] expected_record_data["core"] = True - assert len(mock.request_history) == 7 + assert len(mock.request_history) == 8 # Check is record sent to HEP is correct (only core has changed) - assert mock.request_history[6].json() == expected_record_data + assert mock.request_history[7].json() == expected_record_data @mock.patch("inspirehep.modules.workflows.tasks.submission.send_robotupload") @@ -535,6 +556,13 @@ def test_core_selection_wf_still_runs_when_there_is_core_on_hep_already( "revision_id": 3, }, ) + mock.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) mock.register_uri( "GET", "http://web:8000/curation/literature/collaborations-normalization", @@ -599,8 +627,8 @@ def test_core_selection_wf_still_runs_when_there_is_core_on_hep_already( core_selection_wf = workflow_object_class.get(core_selection_wf_object_id) assert core_selection_wf.status == ObjectStatus.COMPLETED - assert len(mock.request_history) == 6 - assert mock.request_history[5].json() == expected_hep_record["metadata"] + assert len(mock.request_history) == 7 + assert mock.request_history[6].json() == expected_hep_record["metadata"] @mock.patch( @@ -670,6 +698,13 @@ def test_core_selection_wf_skipped_if_record_was_manually_approved( headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + mock.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) mock.register_uri( "GET", "{inspirehep_url}/matcher/exact-match".format( @@ -799,6 +834,13 @@ def test_core_selection_wf_removes_arxiv_core_categories_when_marked_as_non_core headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + mock.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) mock.register_uri( "GET", "{inspirehep_url}/matcher/exact-match".format( diff --git a/tests/integration/workflows/test_workflows_tasks_actions.py b/tests/integration/workflows/test_workflows_tasks_actions.py index 56abdbc942..62aa38f057 100644 --- a/tests/integration/workflows/test_workflows_tasks_actions.py +++ b/tests/integration/workflows/test_workflows_tasks_actions.py @@ -126,259 +126,318 @@ def insert_literature_in_db(workflow_app): def test_normalize_journal_titles_known_journals_with_ref( workflow_app, insert_journals_in_db ): - record = { - "_collections": ["Literature"], - "titles": ["A title"], - "document_type": ["book", "note", "report"], - "publication_info": [ - { - "journal_title": "A Test Journal1", - "journal_record": { - "$ref": "http://localhost:5000/api/journals/1936475" + with requests_mock.Mocker() as request_mocker: + request_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {"A Test Journal1": "Test.Jou.1", "Test.Jou.2": "Test.Jou.2"}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) + record = { + "_collections": ["Literature"], + "titles": ["A title"], + "document_type": ["book", "note", "report"], + "publication_info": [ + { + "journal_title": "A Test Journal1", + "journal_record": { + "$ref": "http://localhost:5000/api/journals/1936475" + }, }, - }, - {"cnum": "C01-01-01"}, - { - "journal_title": "Test.Jou.2", - "journal_record": { - "$ref": "http://localhost:5000/api/journals/1936476" + {"cnum": "C01-01-01"}, + { + "journal_title": "Test.Jou.2", + "journal_record": { + "$ref": "http://localhost:5000/api/journals/1936476" + }, }, - }, - ], - } + ], + } - obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") + obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") - normalize_journal_titles(obj, None) + normalize_journal_titles(obj, None) - assert obj.data["publication_info"][0]["journal_title"] == "Test.Jou.1" - assert obj.data["publication_info"][2]["journal_title"] == "Test.Jou.2" - assert obj.data["publication_info"][0]["journal_record"] == { - "$ref": "http://localhost:5000/api/journals/1936475" - } - assert obj.data["publication_info"][2]["journal_record"] == { - "$ref": "http://localhost:5000/api/journals/1936476" - } - assert len(obj.extra_data["journal_inspire_categories"]) == 2 - assert {"term": "Astrophysics"} in obj.extra_data["journal_inspire_categories"] - assert {"term": "Accelerators"} in obj.extra_data["journal_inspire_categories"] + assert obj.data["publication_info"][0]["journal_title"] == "Test.Jou.1" + assert obj.data["publication_info"][2]["journal_title"] == "Test.Jou.2" + assert obj.data["publication_info"][0]["journal_record"] == { + "$ref": "http://localhost:5000/api/journals/1936475" + } + assert obj.data["publication_info"][2]["journal_record"] == { + "$ref": "http://localhost:5000/api/journals/1936476" + } + assert len(obj.extra_data["journal_inspire_categories"]) == 2 + assert {"term": "Astrophysics"} in obj.extra_data["journal_inspire_categories"] + assert {"term": "Accelerators"} in obj.extra_data["journal_inspire_categories"] def test_normalize_journal_titles_known_journals_with_ref_from_variants( workflow_app, insert_journals_in_db ): - record = { - "_collections": ["Literature"], - "titles": ["A title"], - "document_type": ["book", "note", "report"], - "publication_info": [ - { - "journal_title": "A Test Journal1 Variant 2", - "journal_record": { - "$ref": "http://localhost:5000/api/journals/1936475" + with requests_mock.Mocker() as request_mocker: + request_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {"A Test Journal1 Variant 2": "Test.Jou.1", "A Test Journal2 Variant 3": "Test.Jou.2"}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) + record = { + "_collections": ["Literature"], + "titles": ["A title"], + "document_type": ["book", "note", "report"], + "publication_info": [ + { + "journal_title": "A Test Journal1 Variant 2", + "journal_record": { + "$ref": "http://localhost:5000/api/journals/1936475" + }, }, - }, - {"cnum": "C01-01-01"}, - { - "journal_title": "A Test Journal2 Variant 3", - "journal_record": { - "$ref": "http://localhost:5000/api/journals/1936476" + {"cnum": "C01-01-01"}, + { + "journal_title": "A Test Journal2 Variant 3", + "journal_record": { + "$ref": "http://localhost:5000/api/journals/1936476" + }, }, - }, - ], - } + ], + } - obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") + obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") - normalize_journal_titles(obj, None) + normalize_journal_titles(obj, None) - assert obj.data["publication_info"][0]["journal_title"] == "Test.Jou.1" - assert obj.data["publication_info"][2]["journal_title"] == "Test.Jou.2" - assert obj.data["publication_info"][0]["journal_record"] == { - "$ref": "http://localhost:5000/api/journals/1936475" - } - assert obj.data["publication_info"][2]["journal_record"] == { - "$ref": "http://localhost:5000/api/journals/1936476" - } - assert len(obj.extra_data["journal_inspire_categories"]) == 2 - assert {"term": "Astrophysics"} in obj.extra_data["journal_inspire_categories"] - assert {"term": "Accelerators"} in obj.extra_data["journal_inspire_categories"] + assert obj.data["publication_info"][0]["journal_title"] == "Test.Jou.1" + assert obj.data["publication_info"][2]["journal_title"] == "Test.Jou.2" + assert obj.data["publication_info"][0]["journal_record"] == { + "$ref": "http://localhost:5000/api/journals/1936475" + } + assert obj.data["publication_info"][2]["journal_record"] == { + "$ref": "http://localhost:5000/api/journals/1936476" + } + assert len(obj.extra_data["journal_inspire_categories"]) == 2 + assert {"term": "Astrophysics"} in obj.extra_data["journal_inspire_categories"] + assert {"term": "Accelerators"} in obj.extra_data["journal_inspire_categories"] def test_normalize_journal_titles_known_journals_no_ref( workflow_app, insert_journals_in_db ): - record = { - "_collections": ["Literature"], - "titles": ["A title"], - "document_type": ["book", "note", "report"], - "publication_info": [ - {"journal_title": "A Test Journal1"}, - {"cnum": "C01-01-01"}, - {"journal_title": "Test.Jou.2"}, - ], - } + with requests_mock.Mocker() as request_mocker: + request_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {"A Test Journal1": "Test.Jou.1", "Test.Jou.2": "Test.Jou.2"}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) - obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") + record = { + "_collections": ["Literature"], + "titles": ["A title"], + "document_type": ["book", "note", "report"], + "publication_info": [ + {"journal_title": "A Test Journal1"}, + {"cnum": "C01-01-01"}, + {"journal_title": "Test.Jou.2"}, + ], + } + + obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") - normalize_journal_titles(obj, None) + normalize_journal_titles(obj, None) - assert obj.data["publication_info"][0]["journal_title"] == "Test.Jou.1" - assert obj.data["publication_info"][2]["journal_title"] == "Test.Jou.2" - assert obj.data["publication_info"][0]["journal_record"] == { - "$ref": "http://localhost:5000/api/journals/1936475" - } - assert obj.data["publication_info"][2]["journal_record"] == { - "$ref": "http://localhost:5000/api/journals/1936476" - } - assert len(obj.extra_data["journal_inspire_categories"]) == 2 - assert {"term": "Astrophysics"} in obj.extra_data["journal_inspire_categories"] - assert {"term": "Accelerators"} in obj.extra_data["journal_inspire_categories"] + assert obj.data["publication_info"][0]["journal_title"] == "Test.Jou.1" + assert obj.data["publication_info"][2]["journal_title"] == "Test.Jou.2" + assert obj.data["publication_info"][0]["journal_record"] == { + "$ref": "http://localhost:5000/api/journals/1936475" + } + assert obj.data["publication_info"][2]["journal_record"] == { + "$ref": "http://localhost:5000/api/journals/1936476" + } + assert len(obj.extra_data["journal_inspire_categories"]) == 2 + assert {"term": "Astrophysics"} in obj.extra_data["journal_inspire_categories"] + assert {"term": "Accelerators"} in obj.extra_data["journal_inspire_categories"] def test_normalize_journal_titles_known_journals_wrong_ref( workflow_app, insert_journals_in_db ): - record = { - "_collections": ["Literature"], - "titles": ["A title"], - "document_type": ["book", "note", "report"], - "publication_info": [ - {"journal_title": "A Test Journal1", "journal_record": {"$ref": "wrong1"}}, - {"cnum": "C01-01-01"}, - {"journal_title": "Test.Jou.2", "journal_record": {"$ref": "wrong2"}}, - ], - } + with requests_mock.Mocker() as request_mocker: + request_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {"A Test Journal1": "Test.Jou.1", "Test.Jou.2": "Test.Jou.2"}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) + record = { + "_collections": ["Literature"], + "titles": ["A title"], + "document_type": ["book", "note", "report"], + "publication_info": [ + {"journal_title": "A Test Journal1", "journal_record": {"$ref": "wrong1"}}, + {"cnum": "C01-01-01"}, + {"journal_title": "Test.Jou.2", "journal_record": {"$ref": "wrong2"}}, + ], + } - obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") + obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") - normalize_journal_titles(obj, None) + normalize_journal_titles(obj, None) - assert obj.data["publication_info"][0]["journal_title"] == "Test.Jou.1" - assert obj.data["publication_info"][2]["journal_title"] == "Test.Jou.2" - assert obj.data["publication_info"][0]["journal_record"] == { - "$ref": "http://localhost:5000/api/journals/1936475" - } - assert obj.data["publication_info"][2]["journal_record"] == { - "$ref": "http://localhost:5000/api/journals/1936476" - } - assert len(obj.extra_data["journal_inspire_categories"]) == 2 - assert {"term": "Astrophysics"} in obj.extra_data["journal_inspire_categories"] - assert {"term": "Accelerators"} in obj.extra_data["journal_inspire_categories"] + assert obj.data["publication_info"][0]["journal_title"] == "Test.Jou.1" + assert obj.data["publication_info"][2]["journal_title"] == "Test.Jou.2" + assert obj.data["publication_info"][0]["journal_record"] == { + "$ref": "http://localhost:5000/api/journals/1936475" + } + assert obj.data["publication_info"][2]["journal_record"] == { + "$ref": "http://localhost:5000/api/journals/1936476" + } + assert len(obj.extra_data["journal_inspire_categories"]) == 2 + assert {"term": "Astrophysics"} in obj.extra_data["journal_inspire_categories"] + assert {"term": "Accelerators"} in obj.extra_data["journal_inspire_categories"] def test_normalize_journal_titles_unknown_journals_with_ref( workflow_app, insert_journals_in_db ): - record = { - "_collections": ["Literature"], - "titles": ["A title"], - "document_type": ["book", "note", "report"], - "publication_info": [ - { - "journal_title": "Unknown1", - "journal_record": { - "$ref": "http://localhost:5000/api/journals/0000000" + with requests_mock.Mocker() as request_mocker: + request_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {"Unknown1": "Unknown1", "Unknown2": "Unknown2"}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) + record = { + "_collections": ["Literature"], + "titles": ["A title"], + "document_type": ["book", "note", "report"], + "publication_info": [ + { + "journal_title": "Unknown1", + "journal_record": { + "$ref": "http://localhost:5000/api/journals/0000000" + }, }, - }, - {"cnum": "C01-01-01"}, - { - "journal_title": "Unknown2", - "journal_record": { - "$ref": "http://localhost:5000/api/journals/1111111" + {"cnum": "C01-01-01"}, + { + "journal_title": "Unknown2", + "journal_record": { + "$ref": "http://localhost:5000/api/journals/1111111" + }, }, - }, - ], - } + ], + } - obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") + obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") - normalize_journal_titles(obj, None) + normalize_journal_titles(obj, None) - assert obj.data["publication_info"][0]["journal_title"] == "Unknown1" - assert obj.data["publication_info"][2]["journal_title"] == "Unknown2" - assert obj.data["publication_info"][0]["journal_record"] == { - "$ref": "http://localhost:5000/api/journals/0000000" - } - assert obj.data["publication_info"][2]["journal_record"] == { - "$ref": "http://localhost:5000/api/journals/1111111" - } - assert not obj.extra_data.get("journal_inspire_categories") + assert obj.data["publication_info"][0]["journal_title"] == "Unknown1" + assert obj.data["publication_info"][2]["journal_title"] == "Unknown2" + assert obj.data["publication_info"][0]["journal_record"] == { + "$ref": "http://localhost:5000/api/journals/0000000" + } + assert obj.data["publication_info"][2]["journal_record"] == { + "$ref": "http://localhost:5000/api/journals/1111111" + } + assert not obj.extra_data.get("journal_inspire_categories") def test_normalize_journal_titles_unknown_journals_no_ref( workflow_app, insert_journals_in_db ): - record = { - "_collections": ["Literature"], - "titles": ["A title"], - "document_type": ["book", "note", "report"], - "publication_info": [ - {"journal_title": "Unknown1"}, - {"cnum": "C01-01-01"}, - {"journal_title": "Unknown2"}, - ], - } + with requests_mock.Mocker() as request_mocker: + request_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {"Unknown1": "Unknown1", "Unknown2": "Unknown2"}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) - obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") + record = { + "_collections": ["Literature"], + "titles": ["A title"], + "document_type": ["book", "note", "report"], + "publication_info": [ + {"journal_title": "Unknown1"}, + {"cnum": "C01-01-01"}, + {"journal_title": "Unknown2"}, + ], + } + + obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") - normalize_journal_titles(obj, None) + normalize_journal_titles(obj, None) - assert obj.data["publication_info"][0]["journal_title"] == "Unknown1" - assert obj.data["publication_info"][2]["journal_title"] == "Unknown2" - assert "journal_record" not in obj.data["publication_info"][0] - assert "journal_record" not in obj.data["publication_info"][2] - assert not obj.extra_data.get("journal_inspire_categories") + assert obj.data["publication_info"][0]["journal_title"] == "Unknown1" + assert obj.data["publication_info"][2]["journal_title"] == "Unknown2" + assert "journal_record" not in obj.data["publication_info"][0] + assert "journal_record" not in obj.data["publication_info"][2] + assert not obj.extra_data.get("journal_inspire_categories") def test_normalize_journal_titles_doesnt_assign_categories_from_journals_in_references( workflow_app, insert_journals_in_db ): - record = { - "_collections": ["Literature"], - "titles": ["A title"], - "document_type": ["book", "note", "report"], - "publication_info": [ - { - "journal_title": "A Test Journal1", - "journal_record": { - "$ref": "http://localhost:5000/api/journals/1936475" + with requests_mock.Mocker() as request_mocker: + request_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {"A Test Journal1": "Test.Jou.1", "Proc.Roy.Irish Acad.A": "Proc.Roy.Irish Acad.A"}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) + + record = { + "_collections": ["Literature"], + "titles": ["A title"], + "document_type": ["book", "note", "report"], + "publication_info": [ + { + "journal_title": "A Test Journal1", + "journal_record": { + "$ref": "http://localhost:5000/api/journals/1936475" + }, }, - }, - ], - "references": [ - { - "reference": { - "authors": [{"full_name": "A, Papaetrou"}], - "misc": [ - "A static solution of the equations of the gravitational field for an arbitrary charge distribution" - ], - "publication_info": { - "artid": "191", - "journal_record": { - "$ref": "http://localhost:5000/api/journals/1936476" + ], + "references": [ + { + "reference": { + "authors": [{"full_name": "A, Papaetrou"}], + "misc": [ + "A static solution of the equations of the gravitational field for an arbitrary charge distribution" + ], + "publication_info": { + "artid": "191", + "journal_record": { + "$ref": "http://localhost:5000/api/journals/1936476" + }, + "journal_title": "Proc.Roy.Irish Acad.A", + "journal_volume": "51", + "page_start": "191", + "year": 1947, }, - "journal_title": "Proc.Roy.Irish Acad.A", - "journal_volume": "51", - "page_start": "191", - "year": 1947, - }, + } } - } - ], - } + ], + } - obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") + obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") - normalize_journal_titles(obj, None) + normalize_journal_titles(obj, None) - assert obj.data["publication_info"][0]["journal_title"] == "Test.Jou.1" - assert obj.data["publication_info"][0]["journal_record"] == { - "$ref": "http://localhost:5000/api/journals/1936475" - } - assert len(obj.extra_data["journal_inspire_categories"]) == 1 - assert {"term": "Astrophysics"} in obj.extra_data["journal_inspire_categories"] - assert {"term": "Accelerators"} not in obj.extra_data["journal_inspire_categories"] + assert obj.data["publication_info"][0]["journal_title"] == "Test.Jou.1" + assert obj.data["publication_info"][0]["journal_record"] == { + "$ref": "http://localhost:5000/api/journals/1936475" + } + assert len(obj.extra_data["journal_inspire_categories"]) == 1 + assert {"term": "Astrophysics"} in obj.extra_data["journal_inspire_categories"] + assert {"term": "Accelerators"} not in obj.extra_data["journal_inspire_categories"] def test_update_inspire_categories(workflow_app): @@ -744,43 +803,52 @@ def test_replace_collection_to_hidden_sets_proper_hidden_collections_on_metadata def test_normalize_journal_titles_in_references(workflow_app, insert_journals_in_db): - record = { - "_collections": ["Literature"], - "titles": ["A title"], - "document_type": ["book", "note", "report"], - "references": [ - { - "reference": { - "publication_info": { - "journal_title": "A Test Journal1", + with requests_mock.Mocker() as request_mocker: + request_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {"A Test Journal1": "Test.Jou.1", "Something not in db": "Something not in db"}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) + + record = { + "_collections": ["Literature"], + "titles": ["A title"], + "document_type": ["book", "note", "report"], + "references": [ + { + "reference": { + "publication_info": { + "journal_title": "A Test Journal1", + } } - } - }, - { - "reference": { - "publication_info": { - "journal_title": "Something not in db", + }, + { + "reference": { + "publication_info": { + "journal_title": "Something not in db", + } } - } - }, - ], - } + }, + ], + } - obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") + obj = workflow_object_class.create(data=record, id_user=1, data_type="hep") - normalize_journal_titles(obj, None) + normalize_journal_titles(obj, None) - assert ( - obj.data["references"][0]["reference"]["publication_info"]["journal_title"] - == "Test.Jou.1" - ) - assert obj.data["references"][0]["reference"]["publication_info"][ - "journal_record" - ] == {"$ref": "http://localhost:5000/api/journals/1936475"} - assert ( - obj.data["references"][1]["reference"]["publication_info"]["journal_title"] - == "Something not in db" - ) + assert ( + obj.data["references"][0]["reference"]["publication_info"]["journal_title"] + == "Test.Jou.1" + ) + assert obj.data["references"][0]["reference"]["publication_info"][ + "journal_record" + ] == {"$ref": "http://localhost:5000/api/journals/1936475"} + assert ( + obj.data["references"][1]["reference"]["publication_info"]["journal_title"] + == "Something not in db" + ) def test_normalize_collaborations(workflow_app): @@ -1229,6 +1297,13 @@ def test_core_selection_wf_already_created_show_created_wf( headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + request_mocker.register_uri( + "GET", + "http://web:8000/curation/literature/normalize-journal-titles", + json={"normalized_journal_titles": {}}, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) request_mocker.register_uri( "GET", "{inspirehep_url}/matcher/exact-match".format(