Skip to content

Commit

Permalink
Merge pull request #5 from hathitrust/missed_fields_solr_docs
Browse files Browse the repository at this point in the history
Missed fields solr docs
  • Loading branch information
liseli authored Dec 1, 2023
2 parents b05e297 + 0649dc5 commit 439d643
Show file tree
Hide file tree
Showing 6 changed files with 125 additions and 180 deletions.
29 changes: 19 additions & 10 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ services:
# # - indexer_net
# ports:
# - "8082:8082"
# Solr 6 image (single-node, standalone mode)
solr-lss-dev:
container_name: solr-lss-dev
image: ghcr.io/hathitrust/lss-solr:unstable
Expand All @@ -60,9 +61,24 @@ services:
expose:
- 8983
environment:
- SOLR_HEAP=2g
- SOLR_HEAP=4g
#volumes:
# - solr-lss-dev:/var/solr
#Solr8 cloud mode, see lss_solr_configs repository if you want to add dataLoader service
#solr-lss-dev:
# image: ghcr.io/hathitrust/full-text-search-embedded_zoo:example-8.11
# container_name: solr-lss-dev
# ports:
# - "8983:8983"
# volumes:
# - solr_data:/var/solr/data
# command: solr-foreground -c
# healthcheck:
# test: [ "CMD-SHELL", "solr healthcheck -c core-x" ]
# interval: 5s
# timeout: 10s
# start_period: 30s
# retries: 5
solr-sdr-catalog:
container_name: solr-sdr-catalog
image: ghcr.io/hathitrust/catalog-solr-sample
Expand All @@ -76,11 +92,6 @@ services:
- "9033:9033"
expose:
- 9033
#- SOLR_HOST="solr-sdr-catalog"
#networks:
# - indexer_net
#volumes:
# - solr_sdr_catalog:/var/solr
test:
container_name: indexing_test
build: .
Expand All @@ -93,8 +104,6 @@ services:
condition: service_healthy
solr-sdr-catalog:
condition: service_healthy

#networks:
# indexer_net:
volumes:
tmp:
tmp:
solr_data:
28 changes: 25 additions & 3 deletions document_generator/document_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,24 @@ def get_catalog_identical_fields(metadata: Dict) -> Dict:
@staticmethod
def rename_catalog_fields(metadata: Dict) -> Dict:
    """Copy Catalog metadata into the full-text schema's field names.

    :param metadata: Catalog document (dict) for one record.
    :return: Dict keyed by the full-text field name, with the value taken
        from the corresponding Catalog field (None when the Catalog field
        is absent — ``dict.get`` supplies the default).
    """
    # RENAMED_CATALOG_METADATA maps full-text field name -> Catalog field
    # name, so iterate the mapping once and look each Catalog value up.
    entry = {}
    for new_field, catalog_field in RENAMED_CATALOG_METADATA.items():
        entry[new_field] = metadata.get(catalog_field)
    return entry

@staticmethod
def get_volume_enumcron(ht_id_display: str = None):
enumcron = ht_id_display[0].split("|")[2]
return enumcron

@staticmethod
def get_data_ht_json_obj(ht_json: Dict = None):

catalog_json_data = {
"enumPublishDate": ht_json.get("enum_pubdate")
}
return catalog_json_data

@staticmethod
def get_item_htsource(
id: str = None, catalog_htsource: List = None, catalog_htid: List = None
Expand Down Expand Up @@ -98,15 +106,29 @@ def retrieve_fields_from_Catalog_index(doc_id: str, metadata: Dict) -> Dict:
volume_enumcron = DocumentGenerator.get_volume_enumcron(
metadata.get("ht_id_display")
)

doc_json = [record for record in json.loads(metadata.get("ht_json")) if
(v := record.get('enum_pubdate') and doc_id == record.get('htid'))]

if len(doc_json) > 0:
entry.update(DocumentGenerator.get_data_ht_json_obj(doc_json[0]))

if len(volume_enumcron) > 1:
entry["volume_enumcron"] = volume_enumcron
entry["htsource"] = DocumentGenerator.get_item_htsource(
doc_id, metadata.get("htsource"), metadata.get("ht_id")
)

if entry.get('date') and entry.get('enumPublishDate'):
entry.update({"bothPublishDate": entry.get("enumPublishDate")})

entry["vol_id"] = doc_id
return entry

@staticmethod
def create_ocr_field(document_zip_path) -> Dict:
    """Build the ``ocr`` Solr field from a document's zip archive.

    :param document_zip_path: Path to the document archive WITHOUT the
        ``.zip`` suffix; the suffix is appended here.
    :return: Dict with a single ``ocr`` key holding the concatenated full
        text produced by ``DocumentGenerator.get_full_text_field``.
    """
    # TODO: As part of this function we could extract the following attributes
    # numPages, numChars, charsPerPage. In the future, these attributes could be used to measure query performance
    logging.info(f"Reading {document_zip_path}.zip file")
    full_text = DocumentGenerator.get_full_text_field(f"{document_zip_path}.zip")
    return {"ocr": full_text}
Expand Down
26 changes: 26 additions & 0 deletions document_generator/document_generator_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import sys
import inspect
import json

import pytest
import pytest_cov
Expand Down Expand Up @@ -115,3 +116,28 @@ def test_create_entry(self, get_document_generator):
assert "nyp.33433082046503" in doc_metadata.get("content").get("response").get(
"docs"
)[0].get("ht_id")

# def test_not_mainauthor_document():
# "ht_id" = ["mdp.39015064339677",
# "umn.31951000740320m"]

def test_missed_enumPublishDate(self, get_document_generator):
ht_json = "[{\"htid\":\"nyp.33433069877805\",\"newly_open\":null,\"ingest\":\"20220501\",\"rights\":[\"pdus\",null],\"heldby\":[\"nypl\"],\"collection_code\":\"nyp\",\"enumcron\":\"v. 1\",\"dig_source\":\"google\"}]"

doc_json = [record for record in json.loads(ht_json) if
(v := record.get('enum_pubdate') and "nyp.33433069877805" == record.get('htid'))]

if len(doc_json) > 0:
entry = get_document_generator.get_data_ht_json_obj(doc_json[0])

assert "enumPublishDate" not in entry.keys()

def test_extract_enumPublishDate(self, get_document_generator):
    """A record carrying ``enum_pubdate`` yields an enumPublishDate field."""
    ht_json = "[{\"htid\":\"mdp.39015082023097\",\"newly_open\":null,\"ingest\":\"20230114\",\"rights\":[\"pdus\",null],\"heldby\":[\"cornell\",\"emory\",\"harvard\",\"stanford\",\"uiowa\",\"umich\",\"umn\"],\"collection_code\":\"miu\",\"enumcron\":\"1958\",\"enum_pubdate\":\"1958\",\"enum_pubdate_range\":\"1950-1959\",\"dig_source\":\"google\"},{\"htid\":\"mdp.39015082023246\",\"newly_open\":null,\"ingest\":\"20230114\",\"rights\":[\"pdus\",null],\"heldby\":[\"cornell\",\"emory\",\"harvard\",\"stanford\",\"uiowa\",\"umich\",\"umn\"],\"collection_code\":\"miu\",\"enumcron\":\"1959\",\"enum_pubdate\":\"1959\",\"enum_pubdate_range\":\"1950-1959\",\"dig_source\":\"google\"}]"

    # Same filter used by the document generator: keep records that have
    # enum_pubdate AND match the target htid. The first fixture record
    # satisfies both, so doc_json is non-empty here.
    doc_json = [record for record in json.loads(ht_json) if
                (v := record.get('enum_pubdate') and "mdp.39015082023097" == record.get('htid'))]

    # NOTE(review): if the filter ever matched nothing, this test would pass
    # vacuously without asserting anything — confirm that is acceptable.
    if len(doc_json) > 0:
        entry = get_document_generator.get_data_ht_json_obj(doc_json[0])
        assert "enumPublishDate" in entry.keys()
53 changes: 12 additions & 41 deletions document_generator/indexer_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,57 +8,25 @@
SDR_DIR = "/sdr1"
TRANSLATE_TABLE = str.maketrans({"=": r"\=", ",": r"\,"})

To_CHECK = [
"ht_cover_tag",
"ht_page_feature",
"ht_reading_order",
"ht_scanning_order",
"numPages",
"numChars",
"charsPerPage",
"seq",
"pgnum",
"type_s",
"chunk_seq",
"mainauthor",
"timestamp",
"ctrlnum",
"rptnum",
"isbn",
"edition",
"fullgenre",
"genre",
"hlb3Str",
"hlb3Delimited",
"enumPublishDate",
"bothPublishDate",
"era",
"fullgeographic",
]

# field catalog : field_full_text
# field_full_text : field catalog
RENAMED_CATALOG_METADATA = {
"id": "record_no",
"publishDate": "date",
"author": "Vauthor",
"title": "Vtitle",
"id": "vol_id" # ,
# "ht_id": "id",
"record_no": "id",
"date": "publishDate",
"Vauthor": "author",
"Vtitle": "title"
}

IDENTICAL_CATALOG_METADATA = [
# 'id',
# 'ocr',
"author",
"author2",
# 'date',
# 'record_no',
# 'allfields',
"lccn",
"sdrnum",
"rptnum",
"oclc",
"issn",
"ht_id_display", # Appear in full-text search schema do we want to keep it?
"isbn",
"edition",
# "ht_id_display", # Appears in the full-text search schema — do we want to keep it?
"isn_related",
"callnumber",
"sudoc",
Expand All @@ -73,6 +41,7 @@
"author_rest",
"authorSort",
"author_sortkey",
"mainauthor", # This is an optional field
# ============================
# ====Check title fields====
"vtitle",
Expand All @@ -98,4 +67,6 @@
"publishDate",
"geographicStr",
"countryOfPubStr",
"genre",
"era"
]
2 changes: 2 additions & 0 deletions ht_document/ht_document_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import json

from pypairtree import pairtree
from ht_document.ht_document import HtDocument

Expand Down
Loading

0 comments on commit 439d643

Please sign in to comment.