fixed MySql query bug; Upgraded pyproject file; Updated documentation… #18

Merged 1 commit on Jun 10, 2024
32 changes: 29 additions & 3 deletions README.md
@@ -78,7 +78,7 @@ This use case is used to process a batch of documents selected from production. It
application in Kubernetes and considering production data.

**Use case 2: Generating and indexing long documents in Full-text search index:**
There are some documents that exceed the maximum size of message allowing by the queue system. In this case, only a
There are some documents that exceed the maximum size of a message allowed by the queue system. In this case, only a
queue containing the metadata extracted from Catalog index is used. After that, the components for generating and
indexing the documents are used in sequence in a local environment.

@@ -87,7 +87,7 @@ This use case is implemented by a Python script that retrieves the id to populate
use case is used to process in Kubernetes a batch of documents selected from production. See the section
``Run retriever service by file`` to find the command to run this use case.

## How to test locally indexer service
## How to test the indexer service locally

In your workdir:

@@ -150,7 +150,7 @@ index.

## Use case for processing documents retrieved from a file

The file is created using the Catalog index. The file contains the list of documents to be processed and it is stored
The file is created using the Catalog index. The file contains the list of documents to be processed, and it is stored
in the root of this repository by default, e.g. ~/ht_indexer/filter_ids.txt

```docker compose exec document_retriever python run_retriever_service_by_file.py```
@@ -190,6 +190,32 @@ http://localhost:8983/solr/#/core-x/ --document_local_path ~/tmp/indexing_data`
5. Passing arguments to generate the sample of data:
```./ht_utils/sample_data/sample_data_creator.sh 0.0011 /sdr1/obj```

### Running the tests for each service:

In the working directory,

* Create the image

```docker build -t document_generator .```

* Run document_retriever_service container and test it

```docker compose up document_retriever -d```

```docker compose exec document_retriever pytest document_retriever_service catalog_metadata ht_utils ```

* Run document_generator_service container and test it

```docker compose up document_generator -d```

```docker compose exec document_generator pytest document_generator ht_document ht_queue_service ht_utils```

* Run document_indexer_service container and test it

```docker compose up document_indexer -d```

```docker compose exec document_indexer pytest ht_indexer_api ht_queue_service```
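The per-service steps above can be collected into one helper script. This is a minimal sketch, not part of the repository: it assumes the compose service names used above and a POSIX shell, and `DRY_RUN=1` only prints the commands, which is handy for a quick sanity check.

```shell
# Sketch of a helper that runs every service's test suite in order.
# Assumes the docker compose services named above; set DRY_RUN=1 to
# print the commands instead of executing them.

run() {
  if [ "${DRY_RUN:-0}" = "1" ]; then
    echo "$*"
  else
    "$@"
  fi
}

run_all_service_tests() {
  run docker compose up document_retriever -d
  run docker compose exec document_retriever pytest document_retriever_service catalog_metadata ht_utils
  run docker compose up document_generator -d
  run docker compose exec document_generator pytest document_generator ht_document ht_queue_service ht_utils
  run docker compose up document_indexer -d
  run docker compose exec document_indexer pytest ht_indexer_api ht_queue_service
}
```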

## Data Sampling:

FYI: [Temporary solution until the queue system is implemented]
1 change: 1 addition & 0 deletions document_generator/generator_arguments.py
@@ -14,6 +14,7 @@ def get_mysql_conn():
# MySql connection
try:
mysql_host = os.getenv("MYSQL_HOST", "mysql-sdr")
        logger.info(f"Using MySQL host: {mysql_host}")
except KeyError:
logger.error("Error: `MYSQL_HOST` environment variable required")
sys.exit(1)
12 changes: 9 additions & 3 deletions document_generator/mysql_data_extractor.py
@@ -8,7 +8,7 @@
def create_coll_id_field(large_coll_id_result: dict) -> dict:
if len(large_coll_id_result) > 0:
# Obtain the list with the unique coll_id from the result
return {"coll_id": set(list(large_coll_id_result.get("MColl_ID").values()))}
return {"coll_id": list(set([item.get("MColl_ID") for item in large_coll_id_result]))}
else:
return {"coll_id": [0]}
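The bug fix above switches from treating the query result as a dict of columns to treating it as a list of row dicts. A small, self-contained sketch of the corrected helper (the sample rows below are hypothetical, not taken from the repository's data):

```python
def create_coll_id_field(large_coll_id_result: list[dict]) -> dict:
    """Collapse query rows into a unique list of collection ids.

    `large_coll_id_result` is the MySQL result: a list of row dicts,
    each carrying an "MColl_ID" key; an empty list means the item
    belongs to no large collection.
    """
    if len(large_coll_id_result) > 0:
        # A set comprehension deduplicates before converting back to a list
        return {"coll_id": list({row.get("MColl_ID") for row in large_coll_id_result})}
    return {"coll_id": [0]}

# Hypothetical rows: two items share collection 7
rows = [{"MColl_ID": 7}, {"MColl_ID": 7}, {"MColl_ID": 3}]
print(sorted(create_coll_id_field(rows)["coll_id"]))  # [3, 7]
print(create_coll_id_field([]))  # {'coll_id': [0]}
```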

@@ -56,27 +56,33 @@ def add_large_coll_id_field(self, doc_id: str) -> [dict]:
f'WHERE mb_item.extern_item_id="{doc_id}" '
f'AND mb_coll.num_items > {indexer_config.MAX_ITEM_IDS} ')

logger.info(f"MySQL query: {query_item_in_large_coll}")
large_collection_id = self.mysql_obj.query_mysql(query_item_in_large_coll)

return large_collection_id

def add_rights_field(self, doc_id) -> list[tuple]:

namespace, _id = doc_id.split(".")
query = (
f'SELECT * FROM rights_current WHERE namespace="{namespace}" AND id="{_id}"'
)
logger.info(f"MySQL query: {query}")
return self.mysql_obj.query_mysql(query)
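One thing worth checking in `add_rights_field`: `doc_id.split(".")` unpacks into exactly two parts, so an id whose item part itself contains a dot would raise `ValueError`. If such ids can occur (an assumption, not confirmed by this diff), splitting once is safer; the ids below are illustrative:

```python
def split_doc_id(doc_id: str) -> tuple[str, str]:
    # Split only on the first "." so item ids that contain further
    # dots still parse into (namespace, id).
    namespace, _id = doc_id.split(".", 1)
    return namespace, _id

print(split_doc_id("mdp.39015078560292"))    # ('mdp', '39015078560292')
print(split_doc_id("miun.adx6300.0001.001")) # ('miun', 'adx6300.0001.001')
```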

def add_ht_heldby_field(self, doc_id) -> list[tuple]:
query = (
f'SELECT member_id FROM holdings_htitem_htmember WHERE volume_id="{doc_id}"'
)

logger.info(f"MySQL query: {query}")
# ht_heldby is a list of institutions
return self.mysql_obj.query_mysql(query)

def add_heldby_brlm_field(self, doc_id) -> list[tuple]:
query = f'SELECT member_id FROM holdings_htitem_htmember WHERE volume_id="{doc_id}" AND access_count > 0'

logger.info(f"MySQL query: {query}")
return self.mysql_obj.query_mysql(query)

def retrieve_mysql_data(self, doc_id):
@@ -89,12 +95,12 @@ def retrieve_mysql_data(self, doc_id):
if len(doc_rights) == 1:
entry.update({"rights": doc_rights[0].get("attr")})

# It is a list of members, if the query result is empty the field does not appear in Solr index
# It is a list of members, if the query result is empty, the field does not appear in Solr index
ht_heldby = self.add_ht_heldby_field(doc_id)
if len(ht_heldby) > 0:
entry.update(create_ht_heldby_field(ht_heldby))

# It is a list of members, if the query result is empty the field does not appear in Solr index
# It is a list of members, if the query result is empty, the field does not appear in Solr index
heldby_brlm = self.add_heldby_brlm_field(doc_id)

if len(heldby_brlm) > 0: