-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #8 from hathitrust/DEV-1040-re-structureByservice
Dev 1040 re structure byservice
- Loading branch information
Showing
36 changed files
with
21,483 additions
and
1,172 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -60,6 +60,6 @@ WORKDIR /app | |
|
||
COPY . . | ||
|
||
CMD ["python3"] | ||
CMD ["tail", "-f", "/dev/null"] | ||
|
||
USER appuser |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
import json | ||
|
||
from indexer_config import IDENTICAL_CATALOG_METADATA, RENAMED_CATALOG_METADATA | ||
|
||
|
||
class CatalogRecordMetadata:
    """Build the full-text metadata that is derived from a whole Catalog record."""

    def __init__(self, record: dict):
        """
        :param record: catalog record as a dictionary; it is kept as-is in ``self.record``
        """
        self.record = record
        self.metadata = self.get_metadata()

    def get_metadata(self) -> dict:
        """Create a dictionary with the fulltext fields extracted from catalog metadata.

        The identical-name fields are collected first and the renamed fields
        second, so a renamed field wins if both produce the same key.

        :return: dictionary with the record-level full-text fields
        """
        metadata = {
            **self.get_catalog_identical_fields(),
            **self.rename_catalog_fields(),
        }

        # bothPublishDate is only created when the record has both a date and
        # an enumPublishDate; its value is the enumPublishDate.
        enum_publish_date = self.record.get("enumPublishDate")
        if self.record.get("date") and enum_publish_date:
            metadata["bothPublishDate"] = enum_publish_date

        return metadata

    def get_catalog_identical_fields(self) -> dict:
        """Retrieve the fields that have identical names in the catalog and fulltext documents.

        :return: ``{field: value}`` for every field of ``IDENTICAL_CATALOG_METADATA``
            whose value in the record is truthy; missing or empty fields are skipped
        """
        return {
            field: value
            for field in IDENTICAL_CATALOG_METADATA
            if (value := self.record.get(field))
        }

    def rename_catalog_fields(self) -> dict:
        """Rename the fields from the catalog to the ones used in the fulltext documents.

        ``RENAMED_CATALOG_METADATA`` maps the full-text field name to the catalog
        field name. Unlike the identical fields, every mapped field is emitted:
        a field that is missing from the record gets the value ``None``.

        :return: ``{fulltext_field: record value or None}``
        """
        return {
            new_field: self.record.get(catalog_field)
            for new_field, catalog_field in RENAMED_CATALOG_METADATA.items()
        }
|
||
|
||
class CatalogItemMetadata:
    """Build the full-text metadata of one item (volume) of a Catalog record.

    The resulting ``metadata`` is the record-level metadata merged with the
    item-specific fields (``vol_id``, ``htsource`` and, when available,
    ``enumPublishDate`` and ``volume_enumcron``). On a key collision the
    item-specific value wins.
    """

    def __init__(self, ht_id: str, record_metadata: "CatalogRecordMetadata" = None):
        """
        :param ht_id: identifier of the item inside the catalog record
        :param record_metadata: metadata object of the record that contains the
            item. It is required in practice: its raw ``record`` is read while the
            item metadata is built, so keeping the ``None`` default raises
            ``AttributeError``.
        """
        self.record_metadata = record_metadata
        self.ht_id = ht_id
        metadata = self.get_metadata()

        # Merge both dictionaries; item-level values override record-level ones
        self.metadata = {**self.record_metadata.metadata, **metadata}

    def get_volume_enumcron(self) -> "str | list":
        """Return the third ``|``-separated field of the first ``ht_id_display`` entry.

        NOTE(review): only the first ``ht_id_display`` entry is inspected, whatever
        ``self.ht_id`` is. Presumably the entry of the current item should be
        used for multi-item records -- confirm.

        :return: the enumcron string (it may be empty), or an empty list when
            ``ht_id_display`` is missing, empty, or has fewer than three fields
        """
        # ``or []`` turns a missing/None field into the IndexError handled below
        # instead of an uncaught TypeError.
        displays = self.record_metadata.record.get("ht_id_display") or []
        try:
            return displays[0].split("|")[2]
        except IndexError:
            return []

    def get_metadata(self) -> dict:
        """Collect the item-specific full-text fields.

        :return: dictionary with ``htsource`` and ``vol_id``, plus
            ``enumPublishDate`` and ``volume_enumcron`` when they are available
        """
        metadata = {}

        volume_enumcron = self.get_volume_enumcron()
        doc_json = self.get_data_ht_json_obj()

        if doc_json:
            # The entries are filtered on "enum_pubdate", so that is the key
            # holding the publication date of the item.
            metadata["enumPublishDate"] = doc_json[0].get("enum_pubdate")

        # NOTE(review): ``> 1`` also drops one-character enumcron values;
        # kept as-is, confirm whether ``> 0`` was intended.
        if len(volume_enumcron) > 1:
            metadata["volume_enumcron"] = volume_enumcron

        metadata["htsource"] = self.get_item_htsource()
        metadata["vol_id"] = self.ht_id
        return metadata

    def get_data_ht_json_obj(self) -> list:
        """Obtain the publication data of a specific item in the catalog.

        The ``ht_json`` field of the record is a JSON-encoded list of objects.

        :return: the ``ht_json`` entries whose ``htid`` equals ``self.ht_id`` and
            that have a truthy ``enum_pubdate``; an empty list when there is no
            such entry or the record has no ``ht_json``
        """
        raw_ht_json = self.record_metadata.record.get("ht_json")
        if not raw_ht_json:
            return []

        return [
            item
            for item in json.loads(raw_ht_json)
            if item.get("enum_pubdate") and self.ht_id == item.get("htid")
        ]

    def get_item_htsource(self) -> str:
        """Return the source of this specific item.

        The ``htsource`` entry at the position of ``self.ht_id`` inside the
        ``ht_id`` list is returned. When ``htsource`` has fewer entries than
        ``ht_id``, the first source is used instead.

        :return: source of the item
        :raises ValueError: if ``self.ht_id`` is not in the ``ht_id`` list of the record
        """
        record = self.record_metadata.record
        item_position = record.get("ht_id").index(self.ht_id)
        sources = record.get("htsource")
        try:
            return sources[item_position]
        except IndexError:
            return sources[0]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
import inspect | ||
import json | ||
import os | ||
from copy import deepcopy | ||
|
||
import pytest | ||
|
||
from catalog_metadata.catalog_metadata import CatalogItemMetadata, CatalogRecordMetadata | ||
|
||
# Absolute path of the directory that contains this module; the test data
# files are resolved relative to it.
_module_file = inspect.getfile(inspect.currentframe())
current = os.path.dirname(os.path.abspath(_module_file))
|
||
|
||
# Retrieve JSON file to create a dictionary with a catalog record | ||
@pytest.fixture()
def get_record_data():
    """Load the sample catalog record stored in ``data/catalog.json``.

    :return: the parsed JSON content of the file
    """
    # Decode explicitly as UTF-8: without ``encoding`` open() falls back to the
    # locale encoding, which makes reading non-ASCII records platform dependent.
    with open(os.path.join(current, "data/catalog.json"), "r", encoding="utf-8") as file:
        return json.load(file)
|
||
|
||
# Use the catalog record to create a CatalogRecordMetadata object | ||
@pytest.fixture()
def get_catalog_record_metadata(get_record_data):
    """Wrap the sample catalog record in a ``CatalogRecordMetadata`` object."""
    record_metadata = CatalogRecordMetadata(get_record_data)
    return record_metadata
|
||
|
||
# Create a CatalogItemMetadata object with the catalog record and the ht_id of the item | ||
@pytest.fixture()
def get_item_metadata(get_record_data: dict, get_catalog_record_metadata: CatalogRecordMetadata):
    """Build the ``CatalogItemMetadata`` of item ``mdp.39015078560292`` of the sample record."""
    return CatalogItemMetadata(
        ht_id="mdp.39015078560292",
        record_metadata=get_catalog_record_metadata,
    )
|
||
|
||
# Update some fields of the catalog record to test some functions that retrieve specific fields | ||
@pytest.fixture()
def update_catalog_record_metadata(get_record_data):
    """Return a copy of the sample record with two sources and one extra item id.

    The sample record itself is left untouched (deep copy).
    """
    modified_record = deepcopy(get_record_data)
    modified_record["ht_id"].append("inu.30000108625017")
    modified_record["htsource"] = ["University of Michigan", "Indiana University"]
    return modified_record
|
||
|
||
# Create a CatalogRecordMetadata object with the updated catalog record | ||
@pytest.fixture()
def get_update_catalog_record_metadata(update_catalog_record_metadata):
    """Wrap the updated catalog record in a ``CatalogRecordMetadata`` object."""
    record_metadata = CatalogRecordMetadata(update_catalog_record_metadata)
    return record_metadata
|
||
|
||
# Create a CatalogItemMetadata object with the updated catalog record and the ht_id of the item | ||
@pytest.fixture()
def get_item_metadata_second_position(update_catalog_record_metadata: dict,
                                      get_update_catalog_record_metadata: CatalogRecordMetadata):
    """Build the item metadata of ``inu.30000108625017`` from the updated record.

    That id is the one appended to ``ht_id`` in the updated record, so it is
    used to check the ``htsource`` that is not in the first position.
    """
    return CatalogItemMetadata(
        ht_id="inu.30000108625017",
        record_metadata=get_update_catalog_record_metadata,
    )
|
||
|
||
@pytest.fixture()
def get_catalog_record_without_enum_pubdate(get_record_data):
    """Return a copy of the sample record whose single item has no ``enum_pubdate``."""
    record = deepcopy(get_record_data)
    # One-item ht_json payload without the "enum_pubdate" key
    record["ht_json"] = (
        '[{"htid":"nyp.33433069877805","newly_open":null,"ingest":"20220501",'
        '"rights":["pdus",null],"heldby":["nypl"],"collection_code":"nyp",'
        '"enumcron":"v. 1","dig_source":"google"}]'
    )
    record["ht_id"] = ["nyp.33433069877805"]
    return record
|
||
|
||
@pytest.fixture()
def get_catalog_record_metadata_without_enum_pubdate(get_catalog_record_without_enum_pubdate):
    """Wrap the record without ``enum_pubdate`` in a ``CatalogRecordMetadata`` object."""
    record_metadata = CatalogRecordMetadata(get_catalog_record_without_enum_pubdate)
    return record_metadata
|
||
|
||
@pytest.fixture()
def get_item_metadata_without_enum_pubdate(get_catalog_record_without_enum_pubdate: dict,
                                           get_catalog_record_metadata_without_enum_pubdate: CatalogRecordMetadata):
    """Fake data updating the input document to test an item without the enum_pubdate field"""

    return CatalogItemMetadata("nyp.33433069877805",
                               get_catalog_record_metadata_without_enum_pubdate)
|
||
|
||
class TestCatalogMetadata:
    """Tests of the record-level and item-level catalog metadata classes."""

    def test_catalog_record_metadata_class(self, get_catalog_record_metadata):
        """Record-level metadata keeps htsource but has no item-level identifiers."""
        record_fields = get_catalog_record_metadata.metadata
        assert "ht_id" not in record_fields
        assert "htsource" in record_fields
        assert "vol_id" not in record_fields

    def test_catalog_item_metadata_class(self, get_item_metadata):
        """Item-level metadata carries the item id and the record-level fields."""
        item_fields = get_item_metadata.metadata
        assert get_item_metadata.ht_id == "mdp.39015078560292"
        assert item_fields.get("vol_id") == "mdp.39015078560292"
        assert "title" in item_fields

    def test_get_item_htsource(self, get_item_metadata):
        """The source of the sample item is the expected one."""
        assert get_item_metadata.get_item_htsource() == "University of Michigan"

    def test_get_item_htsource_second_position(self, get_item_metadata_second_position):
        """The item appended to ht_id gets the second source of the updated record."""
        assert get_item_metadata_second_position.get_item_htsource() == "Indiana University"

    def test_get_item_htsource_sharinghtsource(self, get_item_metadata):
        """Same fixture and assertion as ``test_get_item_htsource``.

        NOTE(review): presumably meant to cover items sharing a single htsource
        entry -- confirm and use a dedicated fixture.
        """
        assert get_item_metadata.get_item_htsource() == "University of Michigan"

    def test_get_volume_enumcron_empty(self):
        """
        Some documents do not have the field volume_enumcron, that is because the third ``|``-separated
        value of ht_id_display is an empty string.
        See here https://github.com/hathitrust/hathitrust_catalog_indexer/blob/main/indexers/common_ht.rb#L50 how this
        field is generated.
        :return:
        """
        ht_id_display = [
            "mdp.39015078560292|20220910||1860|1860-1869|||Rābinsan Krūso kā itihāsa. The adventures of Robinson "
            "Crusoe, translated [into Hindi] by Badrī Lāla, from a Bengali version ..."]
        display_fields = ht_id_display[0].split("|")
        assert display_fields[2] == ""

    def test_missed_enum_publish_date(self, get_item_metadata_without_enum_pubdate):
        """No ht_json entry is selected when the item has no enum_pubdate."""
        selected_entries = get_item_metadata_without_enum_pubdate.get_data_ht_json_obj()
        assert len(selected_entries) == 0

    def test_extract_enum_publish_date(self, get_item_metadata):
        """Exactly one ht_json entry is selected for the sample item."""
        selected_entries = get_item_metadata.get_data_ht_json_obj()
        assert len(selected_entries) == 1
Oops, something went wrong.