diff --git a/README.md b/README.md index 47e8576..b756e6c 100644 --- a/README.md +++ b/README.md @@ -71,12 +71,26 @@ systems involved in the flow to index documents in Full-text search index. The q In your workdir: 1. Clone the repository: + ```git clone git@github.com:hathitrust/ht_indexer.git``` 2. Go to the folder ``cd ht_indexer`` 3. Create the image + `docker build -t document_generator .` -4. Run the container - `docker compose up document_retriever -d` + +4. Run the services + + 1. Retriever service + + `docker compose --profile retriever up document_retriever -d` + + 2. Generator service + + `docker compose --profile generator up document_generator -d` + + 3. Indexer service + + `docker compose --profile indexer up document_indexer -d` If you want to run the application in your local environment and outside the docker container, you should follow the steps mentioned in the section [How to set up your python environment](#project-set-up-local-environment) @@ -241,7 +255,7 @@ docker compose exec document_retriever python document_retriever_service/run_ret * **Generator service** ``` -docker compose up document_generator -d +docker compose --profile generator up document_generator -d ``` This container will automatically start the script `python document_generator/document_generator_service.py` that will @@ -250,7 +264,7 @@ be retrieving the documents from the retriever_queue and will be published a new * **Indexer service** ``` -docker compose up document_indexer -d +docker compose --profile indexer up document_indexer -d ``` This container will automatically start the script `python document_indexer_service/document_indexer_service.py` that @@ -296,21 +310,27 @@ In the working directory, * Run document_retriever_service container and test it -```docker compose up document_retriever -d``` +```docker compose --profile retriever up document_retriever -d``` ```docker compose exec document_retriever pytest document_retriever_service catalog_metadata ht_utils ``` 
* Run document_generator_service container and test it -```docker compose up document_generator -d``` +```docker compose --profile generator up document_generator -d``` ```docker compose exec document_generator pytest document_generator ht_document ht_queue_service ht_utils``` * Run document_indexer_service container and test it -```docker compose up document_indexer -d``` +The Solr server requires authentication, so you should set the environment variables SOLR_USER and SOLR_PASSWORD before +starting the container. + +export SOLR_USER=your_solr_username +export SOLR_PASSWORD=your_solr_password + +```docker compose --profile indexer up document_indexer -d``` -```docker compose exec document_indexer pytest ht_indexer_api ht_queue_service``` +```docker compose exec document_indexer pytest document_indexer_service ht_indexer_api ht_queue_service``` ## Hosting diff --git a/docker-compose.yml b/docker-compose.yml index 1ec0f95..a635ca2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: "3" - services: document_retriever: container_name: document_retriever @@ -24,6 +22,7 @@ services: condition: service_healthy tty: true stdin_open: true + profiles: [ retriever ] mysql-sdr: image: ghcr.io/hathitrust/db-image volumes: @@ -48,20 +47,51 @@ services: "--connect" ] timeout: 30s + profiles: [ generator ] solr-lss-dev: - image: ghcr.io/hathitrust/full-text-search-embedded_zoo:example-8.11 + image: ghcr.io/hathitrust/full-text-search-cloud:shards-docker container_name: solr-lss-dev ports: - "8983:8983" + environment: + - ZK_HOST=zoo1:2181 + - SOLR_OPTS=-XX:-UseLargePages + - SOLR_USER=${SOLR_USER:-solr} + - SOLR_PASSWORD=${SOLR_PASSWORD:-solrRocks} + depends_on: + zoo1: + condition: service_healthy volumes: - - solr_data:/var/solr/data - command: solr-foreground -c + - solr1_data:/var/solr/data + # start solr in the background, wait for it to start, then create the collection + command: [ "sh", "-c", 'solr-foreground -c & sleep 150 && export 
SOLR_AUTHENTICATION_OPTS=-Dbasicauth="$SOLR_USER":"$SOLR_PASSWORD" && solr create_collection -d /opt/solr/core-x -c core-x -shards 1 -replicationFactor 1 -p 8983 && wait' ] healthcheck: - test: [ "CMD-SHELL", "solr healthcheck -c core-x" ] - interval: 5s + test: [ "CMD-SHELL", "solr healthcheck -c core-x || echo 'Healthcheck failed'" ] + interval: 30s timeout: 10s - start_period: 30s retries: 5 + profiles: [ indexer ] + zoo1: + image: zookeeper:3.8.0 + container_name: zoo1 + restart: always + hostname: zoo1 + ports: + - 2181:2181 + - 7001:7000 + environment: + ZOO_MY_ID: 1 + ZOO_SERVERS: server.1=zoo1:2888:3888;2181 + ZOO_4LW_COMMANDS_WHITELIST: mntr, conf, ruok + ZOO_CFG_EXTRA: "metricsProvider.className=org.apache.zookeeper.metrics.prometheus.PrometheusMetricsProvider metricsProvider.httpPort=7000 metricsProvider.exportJvmInfo=true" + volumes: + - zoo1_data:/data + healthcheck: + test: [ "CMD-SHELL", "echo ruok | nc localhost 2181 | grep imok" ] + interval: 30s + timeout: 10s + retries: 5 + profiles: [ indexer ] solr-sdr-catalog: container_name: solr-sdr-catalog image: ghcr.io/hathitrust/catalog-solr-sample @@ -75,20 +105,20 @@ services: - "9033:9033" expose: - 9033 - test: - container_name: indexing_test + profiles: [ retriever ] + test_retriever: + container_name: retriever_test build: . volumes: - .:/app - ../tmp:/tmp - command: [ "pytest" ] + command: [ "pytest", "document_retriever_service", "catalog_metadata", "ht_utils" ] depends_on: solr-sdr-catalog: condition: service_healthy - solr-lss-dev: - condition: service_healthy rabbitmq: condition: service_healthy + profiles: [ retriever ] document_generator: container_name: document_generator image: document_generator @@ -112,6 +142,18 @@ services: - TGT_QUEUE_PASS=guest - TGT_QUEUE_USER=guest command: [ "python", "document_generator/document_generator_service.py" ] + profiles: [ generator ] + test_generator: + container_name: generator_test + build: . 
+ volumes: + - .:/app + - ../tmp:/tmp + command: [ "pytest", "document_generator", "ht_document", "ht_queue_service", "ht_utils" ] + depends_on: + rabbitmq: + condition: service_healthy + profiles: [ generator ] document_indexer: container_name: document_indexer image: document_generator @@ -131,7 +173,10 @@ services: - QUEUE_NAME=indexer_queue - QUEUE_PASS=guest - QUEUE_USER=guest + - SOLR_USER=${SOLR_USER:-solr} + - SOLR_PASSWORD=${SOLR_PASSWORD:-solrRocks} command: [ "python", "document_indexer_service/document_indexer_service.py", "--solr_indexing_api", "http://solr-lss-dev:8983/solr/#/core-x/" ] + profiles: [ indexer ] rabbitmq: container_name: rabbitmq image: rabbitmq:3.13-management @@ -147,7 +192,22 @@ services: timeout: 10s start_period: 30s retries: 5 + profiles: [ retriever, generator, indexer ] + test_indexer: + container_name: indexer_test + build: . + volumes: + - .:/app + - ../tmp:/tmp + command: [ "pytest", "ht_indexer_api", "ht_queue_service" ] + depends_on: + rabbitmq: + condition: service_healthy + solr-lss-dev: + condition: service_healthy + profiles: [ indexer ] volumes: - solr_data: mysql_sdr_data: + solr1_data: null + zoo1_data: null diff --git a/document_indexer_service/indexer_arguments.py b/document_indexer_service/indexer_arguments.py index 163f4ad..088f250 100644 --- a/document_indexer_service/indexer_arguments.py +++ b/document_indexer_service/indexer_arguments.py @@ -29,7 +29,12 @@ def __init__(self, parser): self.args = parser.parse_args() - self.solr_api_full_text = HTSolrAPI(url=self.args.solr_indexing_api) + solr_user = os.getenv("SOLR_USER") + solr_password = os.getenv("SOLR_PASSWORD") + + self.solr_api_full_text = HTSolrAPI(url=self.args.solr_indexing_api, + user=solr_user, + password=solr_password) self.document_local_path = self.args.document_local_path diff --git a/ht_indexer_api/ht_indexer_api.py b/ht_indexer_api/ht_indexer_api.py index 9a6955f..9caea7d 100644 --- a/ht_indexer_api/ht_indexer_api.py +++ b/ht_indexer_api/ht_indexer_api.py @@ 
-2,6 +2,7 @@ from typing import Text import requests +from requests.auth import HTTPBasicAuth from ht_utils.ht_logger import get_ht_logger @@ -9,15 +10,16 @@ class HTSolrAPI: - def __init__(self, url): + def __init__(self, url, user=None, password=None): self.url = url + self.auth = HTTPBasicAuth(user, password) if user and password else None def get_solr_status(self): - response = requests.get(self.url) + response = requests.get(self.url, auth=self.auth) return response def index_document(self, xml_data: dict, content_type: Text = "application/json"): - """Feed a JSON object, create an XML string to index the document into SOLR + """Feed a JSON object, create an XML string to index the document into SOLR "Content-Type": "application/json" """ try: @@ -25,6 +27,7 @@ def index_document(self, xml_data: dict, content_type: Text = "application/json" f"{self.url.replace('#/', '')}update/json/docs", headers={"Content-Type": content_type}, json=xml_data, + auth=self.auth, params={ "commit": "true", }, ) @@ -47,6 +50,7 @@ def index_documents(self, path: Path, list_documents: list = None, solr_url_json response = requests.post( f"{self.url.replace('#/', '')}{solr_url_json}?commit=true", headers=headers, + auth=self.auth, data=data_dict, params={ "commit": "true", diff --git a/ht_indexer_api/ht_indexer_api_test.py b/ht_indexer_api/ht_indexer_api_test.py index 266d24e..cf1e7c2 100644 --- a/ht_indexer_api/ht_indexer_api_test.py +++ b/ht_indexer_api/ht_indexer_api_test.py @@ -1,45 +1,69 @@ import os -from pathlib import Path import pytest +from unittest.mock import MagicMock, patch +from pathlib import Path from ht_indexer_api.ht_indexer_api import HTSolrAPI @pytest.fixture def get_solr_api(): return HTSolrAPI( - url="http://solr-lss-dev:8983/solr/#/core-x/" + "http://solr-lss-dev:8983/solr/#/core-x/" ) @pytest.fixture def get_fake_solr_api(): return HTSolrAPI( - url="http://solr-lss-dev:8983/solr/#/core-not_exist/" + "http://solr-lss-dev:8983/solr/#/core-not_exist/" ) -class TestHTSolrAPI: - def test_connection(self, 
get_solr_api): +class TestHTSolrAPI(): + + @patch('ht_indexer_api.ht_indexer_api.requests.get') + def test_connection(self, mock_solr_status, get_solr_api): """ Check if solr server is running :param get_solrAPI: :return: """ + mock_response = MagicMock() + mock_response.status_code = 200 + + mock_solr_status.return_value = mock_response + solr_api_status = get_solr_api.get_solr_status() assert solr_api_status.status_code == 200 - def test_index_document_add(self, get_solr_api): + @patch('ht_indexer_api.ht_indexer_api.HTSolrAPI.index_documents') + def test_index_document_add(self, mock_index_documents, get_solr_api): + # Arrange + mock_response = MagicMock() + mock_response.status_code = 200 + mock_index_documents.return_value = mock_response + document_path = Path(f"{os.path.dirname(__file__)}/data/add") list_documents = ["39015078560292_solr_full_text.xml"] + + # Act response = get_solr_api.index_documents(document_path, list_documents=list_documents, solr_url_json="update/", headers={"Content-Type": "application/xml"}) + + # Assert assert response.status_code == 200 - def test_query_by_id(self, get_solr_api): - """ + @patch('ht_indexer_api.ht_indexer_api.HTSolrAPI.get_documents') + def test_query_by_id(self, mock_get_documents, get_solr_api): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.headers = {"Content-Type": "text/plain;charset=utf-8"} + + mock_get_documents.return_value = mock_response + """ :param get_solrAPI: :return: """ @@ -51,11 +75,17 @@ def test_query_by_id(self, get_solr_api): response.headers["Content-Type"] == "text/plain;charset=utf-8" ) - def test_index_document_delete(self, get_solr_api): + @patch('ht_indexer_api.ht_indexer_api.HTSolrAPI.index_documents') + def test_index_document_delete(self, mock_index_documents, get_solr_api): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_index_documents.return_value = mock_response + document_path = Path( 
f"{os.path.dirname(__file__)}/data/delete" ) # "data/delete" list_documents = ["39015078560292-1-1-flat.solr_delete.xml"] + response = get_solr_api.index_documents(document_path, list_documents=list_documents, solr_url_json="update/", headers={"Content-Type": "application/xml"}) assert response.status_code == 200 diff --git a/main.py b/main.py index 40e0a67..fc8426a 100755 --- a/main.py +++ b/main.py @@ -1,4 +1,5 @@ import argparse +import os import uvicorn from fastapi import FastAPI @@ -37,7 +38,9 @@ def solr_startup(): logger.info("Connecting with Solr server") global solr_api - solr_api = HTSolrAPI(url=args.solr_url) + solr_user = os.getenv("SOLR_USER") + solr_password = os.getenv("SOLR_PASSWORD") + solr_api = HTSolrAPI(url=args.solr_url, user=solr_user, password=solr_password) @app.get("/ping") def check_solr():