Skip to content

Commit

Permalink
Merge pull request #5 from hathitrust/missed_fields_solr_docs
Browse files Browse the repository at this point in the history
Missed fields solr docs
  • Loading branch information
liseli authored Dec 1, 2023
2 parents b05e297 + 0649dc5 commit 439d643
Show file tree
Hide file tree
Showing 6 changed files with 125 additions and 180 deletions.
29 changes: 19 additions & 10 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ services:
# # - indexer_net
# ports:
# - "8082:8082"
# Solr 6 image (single-node, standalone mode)
solr-lss-dev:
container_name: solr-lss-dev
image: ghcr.io/hathitrust/lss-solr:unstable
Expand All @@ -60,9 +61,24 @@ services:
expose:
- 8983
environment:
- SOLR_HEAP=2g
- SOLR_HEAP=4g
#volumes:
# - solr-lss-dev:/var/solr
#Solr8 cloud mode, see lss_solr_configs repository if you want to add dataLoader service
#solr-lss-dev:
# image: ghcr.io/hathitrust/full-text-search-embedded_zoo:example-8.11
# container_name: solr-lss-dev
# ports:
# - "8983:8983"
# volumes:
# - solr_data:/var/solr/data
# command: solr-foreground -c
# healthcheck:
# test: [ "CMD-SHELL", "solr healthcheck -c core-x" ]
# interval: 5s
# timeout: 10s
# start_period: 30s
# retries: 5
solr-sdr-catalog:
container_name: solr-sdr-catalog
image: ghcr.io/hathitrust/catalog-solr-sample
Expand All @@ -76,11 +92,6 @@ services:
- "9033:9033"
expose:
- 9033
#- SOLR_HOST="solr-sdr-catalog"
#networks:
# - indexer_net
#volumes:
# - solr_sdr_catalog:/var/solr
test:
container_name: indexing_test
build: .
Expand All @@ -93,8 +104,6 @@ services:
condition: service_healthy
solr-sdr-catalog:
condition: service_healthy

#networks:
# indexer_net:
volumes:
tmp:
tmp:
solr_data:
28 changes: 25 additions & 3 deletions document_generator/document_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,24 @@ def get_catalog_identical_fields(metadata: Dict) -> Dict:
@staticmethod
def rename_catalog_fields(metadata: Dict) -> Dict:
    """Copy Catalog metadata into the full-text schema's field names.

    :param metadata: Catalog document (dict) for one record.
    :return: Dict keyed by the full-text field name, with the value taken
        from the corresponding Catalog field (None when the Catalog field
        is absent — ``dict.get`` supplies the default).
    """
    # RENAMED_CATALOG_METADATA maps full-text field name -> Catalog field
    # name, so iterate the mapping once and look each Catalog value up.
    entry = {}
    for new_field, catalog_field in RENAMED_CATALOG_METADATA.items():
        entry[new_field] = metadata.get(catalog_field)
    return entry

@staticmethod
def get_volume_enumcron(ht_id_display: str = None):
enumcron = ht_id_display[0].split("|")[2]
return enumcron

@staticmethod
def get_data_ht_json_obj(ht_json: Dict = None):

catalog_json_data = {
"enumPublishDate": ht_json.get("enum_pubdate")
}
return catalog_json_data

@staticmethod
def get_item_htsource(
id: str = None, catalog_htsource: List = None, catalog_htid: List = None
Expand Down Expand Up @@ -98,15 +106,29 @@ def retrieve_fields_from_Catalog_index(doc_id: str, metadata: Dict) -> Dict:
volume_enumcron = DocumentGenerator.get_volume_enumcron(
metadata.get("ht_id_display")
)

doc_json = [record for record in json.loads(metadata.get("ht_json")) if
(v := record.get('enum_pubdate') and doc_id == record.get('htid'))]

if len(doc_json) > 0:
entry.update(DocumentGenerator.get_data_ht_json_obj(doc_json[0]))

if len(volume_enumcron) > 1:
entry["volume_enumcron"] = volume_enumcron
entry["htsource"] = DocumentGenerator.get_item_htsource(
doc_id, metadata.get("htsource"), metadata.get("ht_id")
)

if entry.get('date') and entry.get('enumPublishDate'):
entry.update({"bothPublishDate": entry.get("enumPublishDate")})

entry["vol_id"] = doc_id
return entry

@staticmethod
def create_ocr_field(document_zip_path) -> Dict:
    """Build the ``ocr`` Solr field from a document's zip archive.

    :param document_zip_path: Path to the document archive WITHOUT the
        ``.zip`` suffix; the suffix is appended here.
    :return: Dict with a single ``ocr`` key holding the concatenated full
        text produced by ``DocumentGenerator.get_full_text_field``.
    """
    # TODO: As part of this function we could extract the following attributes
    # numPages, numChars, charsPerPage. In the future, these attributes could be used to measure query performance
    logging.info(f"Reading {document_zip_path}.zip file")
    full_text = DocumentGenerator.get_full_text_field(f"{document_zip_path}.zip")
    return {"ocr": full_text}
Expand Down
26 changes: 26 additions & 0 deletions document_generator/document_generator_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import sys
import inspect
import json

import pytest
import pytest_cov
Expand Down Expand Up @@ -115,3 +116,28 @@ def test_create_entry(self, get_document_generator):
assert "nyp.33433082046503" in doc_metadata.get("content").get("response").get(
"docs"
)[0].get("ht_id")

# def test_not_mainauthor_document():
# "ht_id" = ["mdp.39015064339677",
# "umn.31951000740320m"]

def test_missed_enumPublishDate(self, get_document_generator):
ht_json = "[{\"htid\":\"nyp.33433069877805\",\"newly_open\":null,\"ingest\":\"20220501\",\"rights\":[\"pdus\",null],\"heldby\":[\"nypl\"],\"collection_code\":\"nyp\",\"enumcron\":\"v. 1\",\"dig_source\":\"google\"}]"

doc_json = [record for record in json.loads(ht_json) if
(v := record.get('enum_pubdate') and "nyp.33433069877805" == record.get('htid'))]

if len(doc_json) > 0:
entry = get_document_generator.get_data_ht_json_obj(doc_json[0])

assert "enumPublishDate" not in entry.keys()

def test_extract_enumPublishDate(self, get_document_generator):
    """A record carrying ``enum_pubdate`` yields an enumPublishDate field."""
    ht_json = "[{\"htid\":\"mdp.39015082023097\",\"newly_open\":null,\"ingest\":\"20230114\",\"rights\":[\"pdus\",null],\"heldby\":[\"cornell\",\"emory\",\"harvard\",\"stanford\",\"uiowa\",\"umich\",\"umn\"],\"collection_code\":\"miu\",\"enumcron\":\"1958\",\"enum_pubdate\":\"1958\",\"enum_pubdate_range\":\"1950-1959\",\"dig_source\":\"google\"},{\"htid\":\"mdp.39015082023246\",\"newly_open\":null,\"ingest\":\"20230114\",\"rights\":[\"pdus\",null],\"heldby\":[\"cornell\",\"emory\",\"harvard\",\"stanford\",\"uiowa\",\"umich\",\"umn\"],\"collection_code\":\"miu\",\"enumcron\":\"1959\",\"enum_pubdate\":\"1959\",\"enum_pubdate_range\":\"1950-1959\",\"dig_source\":\"google\"}]"

    # Same filter used by the document generator: keep records that have
    # enum_pubdate AND match the target htid. The first fixture record
    # satisfies both, so doc_json is non-empty here.
    doc_json = [record for record in json.loads(ht_json) if
                (v := record.get('enum_pubdate') and "mdp.39015082023097" == record.get('htid'))]

    # NOTE(review): if the filter ever matched nothing, this test would pass
    # vacuously without asserting anything — confirm that is acceptable.
    if len(doc_json) > 0:
        entry = get_document_generator.get_data_ht_json_obj(doc_json[0])
        assert "enumPublishDate" in entry.keys()
53 changes: 12 additions & 41 deletions document_generator/indexer_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,57 +8,25 @@
SDR_DIR = "/sdr1"
TRANSLATE_TABLE = str.maketrans({"=": r"\=", ",": r"\,"})

To_CHECK = [
"ht_cover_tag",
"ht_page_feature",
"ht_reading_order",
"ht_scanning_order",
"numPages",
"numChars",
"charsPerPage",
"seq",
"pgnum",
"type_s",
"chunk_seq",
"mainauthor",
"timestamp",
"ctrlnum",
"rptnum",
"isbn",
"edition",
"fullgenre",
"genre",
"hlb3Str",
"hlb3Delimited",
"enumPublishDate",
"bothPublishDate",
"era",
"fullgeographic",
]

# field catalog : field_full_text
# field_full_text : field catalog
RENAMED_CATALOG_METADATA = {
"id": "record_no",
"publishDate": "date",
"author": "Vauthor",
"title": "Vtitle",
"id": "vol_id" # ,
# "ht_id": "id",
"record_no": "id",
"date": "publishDate",
"Vauthor": "author",
"Vtitle": "title"
}

IDENTICAL_CATALOG_METADATA = [
# 'id',
# 'ocr',
"author",
"author2",
# 'date',
# 'record_no',
# 'allfields',
"lccn",
"sdrnum",
"rptnum",
"oclc",
"issn",
"ht_id_display", # Appear in full-text search schema do we want to keep it?
"isbn",
"edition",
# "ht_id_display", # Appears in the full-text search schema — do we want to keep it?
"isn_related",
"callnumber",
"sudoc",
Expand All @@ -73,6 +41,7 @@
"author_rest",
"authorSort",
"author_sortkey",
"mainauthor", # This is an optional field
# ============================
# ====Check title fields====
"vtitle",
Expand All @@ -98,4 +67,6 @@
"publishDate",
"geographicStr",
"countryOfPubStr",
"genre",
"era"
]
2 changes: 2 additions & 0 deletions ht_document/ht_document_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import json

from pypairtree import pairtree
from ht_document.ht_document import HtDocument

Expand Down
Loading

0 comments on commit 439d643

Please sign in to comment.