From f0b72b26cb2a538b00af3a1f45826822f1a1077c Mon Sep 17 00:00:00 2001 From: Lianet Sepulveda Torres Date: Thu, 19 Sep 2024 06:54:09 -0400 Subject: [PATCH] Indexer service use Solr authentication to index documents & unites changed to avoid Solr authentication Profiles have been added to the docker-compose to start and test each of the service (retriever, generator, indexer) indendently, but I removed because it did not work well with the github actions & The Solr service has been updated to accept authentication and created the collection. Update README.md --- .github/workflows/tests.yaml | 2 +- README.md | 30 ++++- docker-compose.yml | 121 +++++++++++------- document_indexer_service/indexer_arguments.py | 7 +- env.example | 2 + ht_indexer_api/ht_indexer_api.py | 8 +- ht_indexer_api/ht_indexer_api_test.py | 48 +++++-- init_ht_indexer.sh | 23 ++++ main.py | 5 +- 9 files changed, 181 insertions(+), 65 deletions(-) create mode 100644 env.example create mode 100755 init_ht_indexer.sh diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 8141540..ae6fe52 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -20,4 +20,4 @@ jobs: docker compose build - name: Run tests - run: docker compose run test + run: docker compose run all_tests \ No newline at end of file diff --git a/README.md b/README.md index 47e8576..a79b0db 100644 --- a/README.md +++ b/README.md @@ -71,12 +71,26 @@ systems involved in the flow to index documents in Full-text search index. The q In your workdir: 1. Clone the repository: + ```git clone git@github.com:hathitrust/ht_indexer.git``` 2. Go to the folder ``cd ht_indexer`` 3. Create the image + `docker build -t document_generator .` -4. Run the container - `docker compose up document_retriever -d` + +4. Run the services + + 1. Retriever service + + `docker compose up document_retriever -d` + + 2. Generator service + + `docker compose up document_generator -d` + + 3. 
Indexer service + + `docker compose up document_indexer -d` If you want to run the application in your local environment and outside the docker container, you should follow the steps mentioned in the section [How to set up your python environment](#project-set-up-local-environment) @@ -172,7 +186,7 @@ A message can fail for different reasons: We use a **dead-letter-exchange** to handle messages that are not processed successfully. The dead-letter-exchange is an exchange to which messages will be re-routed if they are rejected by the queue. In the current logic, all the service -using the queue system has a dead-letter-exchange associated with it. One of our future steps is to figure out what we +using the queue system has a dead-letter-exchange associated with it. One of our future steps is to figure out what we will do with the messages in the dead-letter-exchange. @@ -308,9 +322,15 @@ In the working directory, * Run document_indexer_service container and test it +Solr server requires authentication, so you should set up the environment variables SOLR_USER and SOLR_PASSWORD before +starting the container. 
All the users (solr, admin and fulltext) use the same solr password (solrRocks) + +export SOLR_USER=admin +export SOLR_PASSWORD=solrRocks + ```docker compose up document_indexer -d``` -```docker compose exec document_indexer pytest ht_indexer_api ht_queue_service``` +```docker compose exec document_indexer pytest document_indexer_service ht_indexer_api ht_queue_service``` ## Hosting @@ -363,7 +383,7 @@ In the image below, you can see the main kubernetes parts running in this workfl * Document indexer ``` python document_indexer_service/document_indexer_service.py - --solr_indexing_api http://solr8-embedded-zookeeper:8983/solr/#/core-x/ + --solr_indexing_api http://fulltext-workshop-solrcloud-headless:8983/solr/core-x/ ``` * In Kubernetes, you can also use the script `run_retriever_processor_kubernetes.sh` to run the services to retrieve diff --git a/docker-compose.yml b/docker-compose.yml index 1ec0f95..c896b40 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: "3" - services: document_retriever: container_name: document_retriever @@ -11,12 +9,12 @@ services: ports: - "8081:8081" environment: - - SOLR_URL=http://solr-sdr-catalog:9033/solr/#/catalog/ - - SDR_DIR=/sdr1/obj - - QUEUE_HOST=rabbitmq - - QUEUE_NAME=retriever_queue - - QUEUE_PASS=guest - - QUEUE_USER=guest + SOLR_URL: http://solr-sdr-catalog:9033/solr/#/catalog/ + SDR_DIR: /sdr1/obj + QUEUE_HOST: rabbitmq + QUEUE_NAME: retriever_queue + QUEUE_PASS: guest + QUEUE_USER: guest depends_on: solr-sdr-catalog: condition: service_healthy @@ -32,11 +30,11 @@ services: ports: - "3306:3306" environment: - - MYSQL_HOST=mysql-sdr - - MYSQL_USER=mdp-lib - - MYSQL_PASS=mdp-lib - - MYSQL_DATABASE=ht - - MYSQL_RANDOM_ROOT_PASSWORD=1 + MYSQL_HOST: mysql-sdr + MYSQL_USER: mdp-lib + MYSQL_PASS: mdp-lib + MYSQL_DATABASE: ht + MYSQL_RANDOM_ROOT_PASSWORD: 1 healthcheck: interval: 30s retries: 3 @@ -49,18 +47,46 @@ services: ] timeout: 30s solr-lss-dev: - image: 
ghcr.io/hathitrust/full-text-search-embedded_zoo:example-8.11 + image: ghcr.io/hathitrust/full-text-search-cloud:shards-docker container_name: solr-lss-dev ports: - "8983:8983" + environment: + ZK_HOST: zoo1:2181 + SOLR_OPTS: -XX:-UseLargePages + SOLR_USER: solr + SOLR_PASSWORD: 'solrRocks' + depends_on: + zoo1: + condition: service_healthy volumes: - - solr_data:/var/solr/data - command: solr-foreground -c + - solr1_data:/var/solr/data + # start solr in the background, wait for it to start, then create the collection + command: [ "sh", "-c", 'solr-foreground -c & sleep 150 && export SOLR_AUTHENTICATION_OPTS=-Dbasicauth="$SOLR_USER":"$SOLR_PASSWORD" && solr create_collection -d /opt/solr/core-x -c core-x -shards 1 -replicationFactor 1 -p 8983 && wait' ] healthcheck: - test: [ "CMD-SHELL", "solr healthcheck -c core-x" ] - interval: 5s + test: [ "CMD-SHELL", "solr healthcheck -c core-x || echo 'Healthcheck failed'" ] + interval: 30s + timeout: 10s + retries: 5 + zoo1: + image: zookeeper:3.8.0 + container_name: zoo1 + restart: always + hostname: zoo1 + ports: + - 2181:2181 + - 7001:7000 + environment: + ZOO_MY_ID: 1 + ZOO_SERVERS: server.1=zoo1:2888:3888;2181 + ZOO_4LW_COMMANDS_WHITELIST: mntr, conf, ruok + ZOO_CFG_EXTRA: "metricsProvider.className=org.apache.zookeeper.metrics.prometheus.PrometheusMetricsProvider metricsProvider.httpPort=7000 metricsProvider.exportJvmInfo=true" + volumes: + - zoo1_data:/data + healthcheck: + test: [ "CMD", "echo", "ruok", "|", "nc", "localhost", "2181", "|", "grep", "imok" ] + interval: 30s timeout: 10s - start_period: 30s retries: 5 solr-sdr-catalog: container_name: solr-sdr-catalog @@ -75,20 +101,6 @@ services: - "9033:9033" expose: - 9033 - test: - container_name: indexing_test - build: . 
- volumes: - - .:/app - - ../tmp:/tmp - command: [ "pytest" ] - depends_on: - solr-sdr-catalog: - condition: service_healthy - solr-lss-dev: - condition: service_healthy - rabbitmq: - condition: service_healthy document_generator: container_name: document_generator image: document_generator @@ -103,14 +115,14 @@ services: tty: true stdin_open: true environment: - - SRC_QUEUE_HOST=rabbitmq - - SRC_QUEUE_NAME=retriever_queue - - SRC_QUEUE_PASS=guest - - SRC_QUEUE_USER=guest - - TGT_QUEUE_HOST=rabbitmq - - TGT_QUEUE_NAME=indexer_queue - - TGT_QUEUE_PASS=guest - - TGT_QUEUE_USER=guest + SRC_QUEUE_HOST: rabbitmq + SRC_QUEUE_NAME: retriever_queue + SRC_QUEUE_PASS: guest + SRC_QUEUE_USER: guest + TGT_QUEUE_HOST: rabbitmq + TGT_QUEUE_NAME: indexer_queue + TGT_QUEUE_PASS: guest + TGT_QUEUE_USER: guest command: [ "python", "document_generator/document_generator_service.py" ] document_indexer: container_name: document_indexer @@ -127,10 +139,12 @@ services: tty: true stdin_open: true environment: - - QUEUE_HOST=rabbitmq - - QUEUE_NAME=indexer_queue - - QUEUE_PASS=guest - - QUEUE_USER=guest + QUEUE_HOST: rabbitmq + QUEUE_NAME: indexer_queue + QUEUE_PASS: guest + QUEUE_USER: guest + SOLR_USER: solr + SOLR_PASSWORD: 'solrRocks' command: [ "python", "document_indexer_service/document_indexer_service.py", "--solr_indexing_api", "http://solr-lss-dev:8983/solr/#/core-x/" ] rabbitmq: container_name: rabbitmq @@ -147,7 +161,22 @@ services: timeout: 10s start_period: 30s retries: 5 + all_tests: + container_name: all_tests + build: . 
+ volumes: + - .:/app + - ../tmp:/tmp + command: [ "pytest" ] + depends_on: + solr-sdr-catalog: + condition: service_healthy + solr-lss-dev: + condition: service_healthy + rabbitmq: + condition: service_healthy volumes: - solr_data: mysql_sdr_data: + solr1_data: null + zoo1_data: null diff --git a/document_indexer_service/indexer_arguments.py b/document_indexer_service/indexer_arguments.py index 163f4ad..088f250 100644 --- a/document_indexer_service/indexer_arguments.py +++ b/document_indexer_service/indexer_arguments.py @@ -29,7 +29,12 @@ def __init__(self, parser): self.args = parser.parse_args() - self.solr_api_full_text = HTSolrAPI(url=self.args.solr_indexing_api) + solr_user = os.getenv("SOLR_USER") + solr_password = os.getenv("SOLR_PASSWORD") + + self.solr_api_full_text = HTSolrAPI(url=self.args.solr_indexing_api, + user=solr_user, + password=solr_password) self.document_local_path = self.args.document_local_path diff --git a/env.example b/env.example new file mode 100644 index 0000000..8d62770 --- /dev/null +++ b/env.example @@ -0,0 +1,2 @@ +SOLR_USER=solr +SOLR_PASSWORD=solrRocks \ No newline at end of file diff --git a/ht_indexer_api/ht_indexer_api.py b/ht_indexer_api/ht_indexer_api.py index 9a6955f..9caea7d 100644 --- a/ht_indexer_api/ht_indexer_api.py +++ b/ht_indexer_api/ht_indexer_api.py @@ -2,6 +2,7 @@ from typing import Text import requests +from requests.auth import HTTPBasicAuth from ht_utils.ht_logger import get_ht_logger @@ -9,15 +10,16 @@ class HTSolrAPI: - def __init__(self, url): + def __init__(self, url, user=None, password=None): self.url = url + self.auth = HTTPBasicAuth(user, password) if user and password else None def get_solr_status(self): response = requests.get(self.url) return response def index_document(self, xml_data: dict, content_type: Text = "application/json"): - """Feed a JSON object, create an XML string to index the document into SOLR + """Feed a JSON object, create an XML string to index the document into SOLR 
"Content-Type": "application/json" """ try: @@ -25,6 +27,7 @@ def index_document(self, xml_data: dict, content_type: Text = "application/json" f"{self.url.replace('#/', '')}update/json/docs", headers={"Content-Type": content_type}, json=xml_data, + auth=self.auth, params={ "commit": "true", }, ) @@ -47,6 +50,7 @@ def index_documents(self, path: Path, list_documents: list = None, solr_url_json response = requests.post( f"{self.url.replace('#/', '')}{solr_url_json}?commit=true", headers=headers, + auth=self.auth, data=data_dict, params={ "commit": "true", diff --git a/ht_indexer_api/ht_indexer_api_test.py b/ht_indexer_api/ht_indexer_api_test.py index 266d24e..cf1e7c2 100644 --- a/ht_indexer_api/ht_indexer_api_test.py +++ b/ht_indexer_api/ht_indexer_api_test.py @@ -1,45 +1,69 @@ import os -from pathlib import Path import pytest +from unittest.mock import MagicMock, patch +from pathlib import Path from ht_indexer_api.ht_indexer_api import HTSolrAPI @pytest.fixture def get_solr_api(): return HTSolrAPI( - url="http://solr-lss-dev:8983/solr/#/core-x/" + "http://solr-lss-dev:8983/solr/#/core-x/" ) @pytest.fixture def get_fake_solr_api(): return HTSolrAPI( - url="http://solr-lss-dev:8983/solr/#/core-not_exist/" + "http://solr-lss-dev:8983/solr/#/core-not_exist/" ) -class TestHTSolrAPI: - def test_connection(self, get_solr_api): +class TestHTSolrAPI(): + + @patch('ht_indexer_api.ht_indexer_api.HTSolrAPI.get_solr_status') + def test_connection(self, mock_solr_status, get_solr_api): """ Check if solr server is running :param get_solrAPI: :return: """ + mock_response = MagicMock() + mock_response.status_code = 200 + + mock_solr_status.return_value = mock_response + solr_api_status = get_solr_api.get_solr_status() assert solr_api_status.status_code == 200 - def test_index_document_add(self, get_solr_api): + @patch('ht_indexer_api.ht_indexer_api.HTSolrAPI.index_documents') + def test_index_document_add(self, mock_index_documents, get_solr_api): + # Arrange + mock_response = 
MagicMock() + mock_response.status_code = 200 + mock_index_documents.return_value = mock_response + document_path = Path(f"{os.path.dirname(__file__)}/data/add") list_documents = ["39015078560292_solr_full_text.xml"] + + # Act response = get_solr_api.index_documents(document_path, list_documents=list_documents, solr_url_json="update/", headers={"Content-Type": "application/xml"}) + + # Assert assert response.status_code == 200 - def test_query_by_id(self, get_solr_api): - """ + @patch('ht_indexer_api.ht_indexer_api.HTSolrAPI.get_documents') + def test_query_by_id(self, mock_get_documents, get_solr_api): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.headers = {"Content-Type": "text/plain;charset=utf-8"} + + mock_get_documents.return_value = mock_response + """ :param get_solrAPI: :return: """ @@ -51,11 +75,17 @@ def test_query_by_id(self, get_solr_api): response.headers["Content-Type"] == "text/plain;charset=utf-8" ) - def test_index_document_delete(self, get_solr_api): + @patch('ht_indexer_api.ht_indexer_api.HTSolrAPI.index_documents') + def test_index_document_delete(self, mock_index_documents, get_solr_api): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_index_documents.return_value = mock_response + document_path = Path( f"{os.path.dirname(__file__)}/data/delete" ) # "data/delete" list_documents = ["39015078560292-1-1-flat.solr_delete.xml"] + response = get_solr_api.index_documents(document_path, list_documents=list_documents, solr_url_json="update/", headers={"Content-Type": "application/xml"}) assert response.status_code == 200 diff --git a/init_ht_indexer.sh b/init_ht_indexer.sh new file mode 100755 index 0000000..b53b096 --- /dev/null +++ b/init_ht_indexer.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +echo "🚢 Build docker images" + +docker build -t document_generator . 
+ +echo "🚢 Run document_retriever" +docker compose up document_retriever -d + +echo "🌎 Run document_retriever test" +docker compose exec document_retriever pytest document_retriever_service catalog_metadata ht_utils + +echo "🚢 Run document_generator" +docker compose up document_generator -d + +echo "🌎 Run document_generator test" +docker compose exec document_generator pytest document_generator ht_document ht_queue_service ht_utils + +echo "🚢 Run document_indexer" +docker compose up document_indexer -d + +echo "🌎 Run document_indexer test" +docker compose exec document_indexer pytest document_indexer_service ht_indexer_api ht_queue_service \ No newline at end of file diff --git a/main.py b/main.py index 40e0a67..fc8426a 100755 --- a/main.py +++ b/main.py @@ -1,4 +1,5 @@ import argparse +import os import uvicorn from fastapi import FastAPI @@ -37,7 +38,9 @@ def solr_startup(): logger.info("Connecting with Solr server") global solr_api - solr_api = HTSolrAPI(url=args.solr_url) + solr_user = os.getenv("SOLR_USER") + solr_password = os.getenv("SOLR_PASSWORD") + solr_api = HTSolrAPI(url=args.solr_url, user=solr_user, password=solr_password) @app.get("/ping") def check_solr():