Merge pull request #8 from hathitrust/DEV-1040-re-structureByservice

Dev 1040 re structure byservice
hathitrust · Mar 20, 2024 · bc9d58f · bc9d58f
2 parents 0380dd0 + c59d84c
commit bc9d58f
Show file tree

Hide file tree

Showing 36 changed files with 21,483 additions and 1,172 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -60,6 +60,6 @@ WORKDIR /app
 
 COPY . .
 
-CMD ["python3"]
+CMD ["tail", "-f", "/dev/null"]
 
 USER appuser
diff --git a/README.md b/README.md
@@ -9,6 +9,39 @@ This application instantiates two solr servers, through docker containers. Catal
 documents and Full-text (solr-lss-dev) search index for indexing them.
 Both containers must be running before the API is loaded; the docker-compose.yml file takes care of that.
 
+## Use cases
+
+For all the use cases, the query looks like:
+
+``` --query id:100673101 
+    --query ht_id:umn.31951d01828300z
+    --query *:* default query to retrieve all the documents in Catalog
+```
+
+1- Create a TXT file listing ht_id from Catalog index
+`python ht_indexer/document_retriever_service/catalog_retriever_service.py --query id:100673101
+--output_file ~/tmp/ht_ids.txt`
+
+* By default, the file will be created in the root folder of the project
+
+2- Generate full-text search documents for all the items in Catalog index
+`python ht_indexer/document_retriever_service/full_text_search_retriever_service.py --query id:100673101
+--document_local_path ~/tmp`
+
+* The query parameter could be *:* to retrieve all the documents in Catalog index
+
+3- Generate full-text search documents given ht_id
+`python ht_indexer/document_retriever_service/full_text_search_retriever_service.py --query ht_id:umn.31951d01828300z
+--document_local_path ~/tmp`
+
+4- Retrieve files from pairtree-based repository
+`python ~/ht_indexer/document_retriever_service/full_text_search_retriever_by_file.py
+--list_ids_path /Users/lisepul/Documents/repositories/python/ht_indexer/filter_ids.txt`
+
+5- Index the documents in full-text search index
+`python3 ~/ht_indexer/document_indexer_service/document_indexer_service.py --solr_indexing_api
+http://localhost:8983/solr/#/core-x/ --document_local_path ~/tmp/indexing_data`
+
 ## Setting up ht_indexer
 
 1. Clone the repository in your working environment
@@ -189,6 +222,26 @@ On mac,
     * `` poetry export -f requirements.txt --output requirements.txt ``
     * Use `` poetry update `` if you change your .toml file and want to generate a new version of the .lock file
 
+## How to test the indexer service locally
+
+In your workdir:
+
+Step 1. Create /sdr1/obj directory
+`mkdir ../sdr1/obj`
+
+Step 2. Retrieve data for testing from the pairtree repository
+`scp $HT_SSH_HOST:/sdr1/obj/umn/pairtree_root/31/95/1d/03/01/41/20/v/31951d03014120v/31951d03014120v{.zip,mets.xml} ../sdr1/obj`
+
+Step 3. Create the image
+`docker build -t document_generator .
+docker compose up document_retriever -d`
+
+Step 4. Set the MySQL credentials
+`export MYSQL_USER=
+export MYSQL_PASS=`
+
+Step 5. Generate document
+`docker exec document_retriever python document_retriever_service/full_text_search_retriever_service.py --query ht_id:mb.39015078560292 --document_local_path /Users/lisepul/Documents/repositories/python/tmp --document_repository local`
+
 ## DockerFile explanations
 
 **What is the best python Docker image to use?**
@@ -227,6 +280,10 @@ export PUBLIC_KEY=public_key_name
 
 Reference used for python implementation
 
+Python Linter:
+Ruff: https://astral.sh/ruff
+Enhancing Python Code Quality: A Comprehensive Guide to Linting with
+Ruff: https://dev.to/ken_mwaura1/enhancing-python-code-quality-a-comprehensive-guide-to-linting-with-ruff-3d6g
 Parser XML files
 https://lxml.de/tutorial.html#parsing-from-strings-and-files
 https://pymotw.com/3/xml.etree.ElementTree/parse.html

diff --git a/catalog_metadata/__init__.py b/catalog_metadata/__init__.py
diff --git a/catalog_metadata/catalog_metadata.py b/catalog_metadata/catalog_metadata.py
@@ -0,0 +1,105 @@
+import json
+
+from indexer_config import IDENTICAL_CATALOG_METADATA, RENAMED_CATALOG_METADATA
+
+
+class CatalogRecordMetadata:
+    """Expose the full-text metadata that can be derived from one Catalog record."""
+
+    def __init__(self, record: dict):
+        # The raw record is kept as an attribute next to the derived metadata.
+        self.record = record
+        self.metadata = self.get_metadata()
+
+    def get_metadata(self) -> dict:
+        """Build the dictionary of full-text fields extracted from the catalog record.
+
+        Fields copied under their own name are added first and renamed fields
+        second, so a renamed field wins if both produce the same key.
+        """
+        metadata = {
+            **self.get_catalog_identical_fields(),
+            **self.rename_catalog_fields(),
+        }
+
+        # bothPublishDate is only added when the record has a truthy "date"
+        # and a truthy "enumPublishDate"; its value is the latter.
+        enum_publish_date = self.record.get("enumPublishDate")
+        if self.record.get("date") and enum_publish_date:
+            metadata["bothPublishDate"] = enum_publish_date
+
+        return metadata
+
+    def get_catalog_identical_fields(self) -> dict:
+        """Return the fields named identically in catalog and full-text documents.
+
+        Fields that are missing from the record, or whose value is falsy, are skipped.
+        """
+        return {
+            field: value
+            for field in IDENTICAL_CATALOG_METADATA
+            if (value := self.record.get(field))
+        }
+
+    def rename_catalog_fields(self) -> dict:
+        """Return the catalog fields under the names used in full-text documents.
+
+        Every configured full-text name is present in the result; the value is
+        None when the record does not have the corresponding catalog field.
+        """
+        return {
+            fulltext_name: self.record.get(catalog_name)
+            for fulltext_name, catalog_name in RENAMED_CATALOG_METADATA.items()
+        }
+
+
+class CatalogItemMetadata:
+    """Metadata of one item, identified by its ht_id, inside a Catalog record.
+
+    The ``metadata`` attribute merges the record-level metadata with the
+    item-level fields; item-level values win when both define the same key.
+    """
+
+    def __init__(self, ht_id: str, record_metadata: CatalogRecordMetadata = None):
+        # NOTE(review): the None default is kept for signature compatibility, but
+        # every method dereferences record_metadata, so it must always be supplied.
+        self.record_metadata = record_metadata
+        self.ht_id = ht_id
+        metadata = self.get_metadata()
+
+        # Merge both dictionaries (item-level entries override record-level ones)
+        self.metadata = {**self.record_metadata.metadata, **metadata}
+
+    def get_volume_enumcron(self) -> "str | list":
+        """Return the third pipe-separated field of the first ht_id_display entry.
+
+        :return: the field as a string (possibly empty), or an empty list when
+            ht_id_display is missing, empty, or has fewer than three fields.
+        """
+        # NOTE(review): always reads entry 0, not the entry of self.ht_id - confirm
+        # this is intended for records that hold several items.
+        try:
+            return self.record_metadata.record.get("ht_id_display")[0].split("|")[2]
+        except (IndexError, TypeError):
+            # TypeError covers a record without ht_id_display (None is not indexable).
+            return []
+
+    def get_metadata(self) -> dict:
+        """Build the item-level fields: enumPublishDate, volume_enumcron, htsource, vol_id.
+
+        :return: dictionary with the item-level metadata
+        :raises ValueError: if self.ht_id is not listed in the record's ht_id field
+        """
+        metadata = {}
+
+        volume_enumcron = self.get_volume_enumcron()
+
+        doc_json = self.get_data_ht_json_obj()
+
+        if doc_json:
+            # The parsed ht_json entries carry the date under "enum_pubdate";
+            # they have no "ht_json" key, so reading that key always gave None.
+            metadata["enumPublishDate"] = doc_json[0].get("enum_pubdate")
+
+        # Only values longer than one character are kept (original behaviour).
+        if len(volume_enumcron) > 1:
+            metadata["volume_enumcron"] = volume_enumcron
+        metadata["htsource"] = self.get_item_htsource()
+
+        metadata["vol_id"] = self.ht_id
+        return metadata
+
+    def get_data_ht_json_obj(self) -> list:
+        """Obtain the ht_json entries of this item that carry a publication date.
+
+        :return: the entries whose "htid" equals self.ht_id and whose
+            "enum_pubdate" is truthy; an empty list when there are none or when
+            the record has no ht_json value.
+        """
+        raw_ht_json = self.record_metadata.record.get("ht_json")
+        if not raw_ht_json:
+            return []
+
+        return [
+            item
+            for item in json.loads(raw_ht_json)
+            if item.get("enum_pubdate") and self.ht_id == item.get("htid")
+        ]
+
+    def get_item_htsource(self) -> str:
+        """Return the source of this item.
+
+        The catalog record may list several sources. The one at the same
+        position as self.ht_id in the record's ht_id list is returned; when the
+        htsource list is shorter than that position, its first entry is used.
+
+        :return: the source of the item
+        :raises ValueError: if self.ht_id is not listed in the record's ht_id field
+        """
+        record = self.record_metadata.record
+        item_position = record.get("ht_id").index(self.ht_id)
+        sources = record.get("htsource")
+        try:
+            return sources[item_position]
+        except IndexError:
+            return sources[0]
diff --git a/catalog_metadata/catalog_metadata_test.py b/catalog_metadata/catalog_metadata_test.py
@@ -0,0 +1,127 @@
+import inspect
+import json
+import os
+from copy import deepcopy
+
+import pytest
+
+from catalog_metadata.catalog_metadata import CatalogItemMetadata, CatalogRecordMetadata
+
+# Absolute path of the directory that contains this test module; used to locate data/catalog.json
+current = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+
+
+# Retrieve JSON file to create a dictionary with a catalog record
+@pytest.fixture()
+def get_record_data():
+    """Load the sample catalog record stored in data/catalog.json."""
+    # Read explicitly as UTF-8 so the result does not depend on the platform's
+    # default text encoding.
+    with open(os.path.join(current, "data/catalog.json"), "r", encoding="utf-8") as file:
+        return json.load(file)
+
+
+@pytest.fixture()
+def get_catalog_record_metadata(get_record_data):
+    """Wrap the sample catalog record in a CatalogRecordMetadata object."""
+    record_metadata = CatalogRecordMetadata(get_record_data)
+    return record_metadata
+
+
+@pytest.fixture()
+def get_item_metadata(get_record_data: dict, get_catalog_record_metadata: CatalogRecordMetadata):
+    """Build the CatalogItemMetadata of item mdp.39015078560292 from the sample record."""
+    item_id = "mdp.39015078560292"
+    return CatalogItemMetadata(item_id, get_catalog_record_metadata)
+
+
+@pytest.fixture()
+def update_catalog_record_metadata(get_record_data):
+    """Return a copy of the sample record with an extra ht_id and a two-entry htsource list."""
+    # Work on a deep copy so the record returned by get_record_data is not mutated.
+    extended_record = deepcopy(get_record_data)
+    extended_record["ht_id"].append("inu.30000108625017")
+    extended_record["htsource"] = ["University of Michigan", "Indiana University"]
+    return extended_record
+
+
+@pytest.fixture()
+def get_update_catalog_record_metadata(update_catalog_record_metadata):
+    """Wrap the updated catalog record in a CatalogRecordMetadata object."""
+    updated_record_metadata = CatalogRecordMetadata(update_catalog_record_metadata)
+    return updated_record_metadata
+
+
+@pytest.fixture()
+def get_item_metadata_second_position(update_catalog_record_metadata: dict,
+                                      get_update_catalog_record_metadata: CatalogRecordMetadata):
+    """Build the item inu.30000108625017 from the updated record.
+
+    The updated record is fake data used to test an htsource that is not in
+    the first position of the htsource field.
+    """
+    appended_item_id = "inu.30000108625017"
+    return CatalogItemMetadata(appended_item_id, get_update_catalog_record_metadata)
+
+
+@pytest.fixture()
+def get_catalog_record_without_enum_pubdate(get_record_data):
+    """Return a copy of the sample record whose only item has no enum_pubdate in ht_json."""
+    item_id = "nyp.33433069877805"
+    # Adjacent literals are concatenated into a single JSON string.
+    ht_json_without_enum_pubdate = (
+        '[{"htid":"nyp.33433069877805","newly_open":null,"ingest":"20220501",'
+        '"rights":["pdus",null],"heldby":["nypl"],"collection_code":"nyp",'
+        '"enumcron":"v. 1","dig_source":"google"}]'
+    )
+
+    record_copy = deepcopy(get_record_data)
+    record_copy["ht_json"] = ht_json_without_enum_pubdate
+    record_copy["ht_id"] = [item_id]
+    return record_copy
+
+
+@pytest.fixture()
+def get_catalog_record_metadata_without_enum_pubdate(get_catalog_record_without_enum_pubdate):
+    """Wrap the record without enum_pubdate in a CatalogRecordMetadata object."""
+    record_metadata = CatalogRecordMetadata(get_catalog_record_without_enum_pubdate)
+    return record_metadata
+
+
+@pytest.fixture()
+def get_item_metadata_without_enum_pubdate(get_catalog_record_without_enum_pubdate: dict,
+                                           get_catalog_record_metadata_without_enum_pubdate: CatalogRecordMetadata):
+    """Build the item nyp.33433069877805 from the fake record that has no enum_pubdate."""
+    item_id = "nyp.33433069877805"
+    return CatalogItemMetadata(item_id, get_catalog_record_metadata_without_enum_pubdate)
+
+
+class TestCatalogMetadata:
+    """Tests for CatalogRecordMetadata and CatalogItemMetadata."""
+
+    def test_catalog_record_metadata_class(self, get_catalog_record_metadata):
+        """Record-level metadata has htsource but neither ht_id nor vol_id."""
+        record_fields = get_catalog_record_metadata.metadata
+        assert 'ht_id' not in record_fields
+        assert 'htsource' in record_fields
+        assert 'vol_id' not in record_fields
+
+    def test_catalog_item_metadata_class(self, get_item_metadata):
+        """Item-level metadata carries the item id and the record-level fields."""
+        assert get_item_metadata.ht_id == "mdp.39015078560292"
+        assert "mdp.39015078560292" == get_item_metadata.metadata.get('vol_id')
+        assert "title" in get_item_metadata.metadata
+
+    def test_get_item_htsource(self, get_item_metadata):
+        """The source found at the item's own position is returned."""
+        htsource = get_item_metadata.get_item_htsource()
+        assert htsource == "University of Michigan"
+
+    def test_get_item_htsource_second_position(self, get_item_metadata_second_position):
+        """An item in the second position gets the second source."""
+        htsource = get_item_metadata_second_position.get_item_htsource()
+        assert htsource == "Indiana University"
+
+    def test_get_item_htsource_sharinghtsource(self, update_catalog_record_metadata):
+        """Two items sharing a single source: the item past the end of htsource
+        falls back to the first (only) source.
+
+        This test previously repeated test_get_item_htsource and never reached
+        the fallback branch.
+        """
+        # The fixture returns a fresh deep copy, so it is safe to modify it here.
+        update_catalog_record_metadata["htsource"] = ["University of Michigan"]
+        item_metadata = CatalogItemMetadata("inu.30000108625017",
+                                            CatalogRecordMetadata(update_catalog_record_metadata))
+
+        htsource = item_metadata.get_item_htsource()
+        assert htsource == "University of Michigan"
+
+    def test_get_volume_enumcron_empty(self):
+        """
+        Some documents do not have the field volume_enumcron, that is because it is an empty string in the third
+        pipe-separated field of ht_id_display.
+        See here https://github.com/hathitrust/hathitrust_catalog_indexer/blob/main/indexers/common_ht.rb#L50 how this
+        field is generated.
+        :return:
+        """
+
+        volume_enumcron = ""
+        ht_id_display = [
+            "mdp.39015078560292|20220910||1860|1860-1869|||Rābinsan Krūso kā itihāsa. The adventures of Robinson "
+            "Crusoe, translated [into Hindi] by Badrī Lāla, from a Bengali version ..."]
+        assert volume_enumcron == ht_id_display[0].split("|")[2]
+
+    def test_missed_enum_publish_date(self, get_item_metadata_without_enum_pubdate):
+        """No ht_json entry is selected when the item has no enum_pubdate."""
+        doc_json = get_item_metadata_without_enum_pubdate.get_data_ht_json_obj()
+        assert len(doc_json) == 0
+
+    def test_extract_enum_publish_date(self, get_item_metadata):
+        """Exactly one ht_json entry is selected for the sample item."""
+        doc_json = get_item_metadata.get_data_ht_json_obj()
+        assert len(doc_json) == 1