diff --git a/README.md b/README.md index 71cec8b9..09842330 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ The `env.json` file contains environment variable values for the lambda function Some of the values can be found as follows: - `API_TOKEN_SECRET` - already defined; value has to exist but doesn't matter in dev mode -- `ELASTICSEARCH_ENDPOINT` - run the following command: +- `OPENSEARCH_ENDPOINT` - run the following command: ``` aws secretsmanager get-secret-value \ --secret-id dev-environment/config/meadow --query SecretString \ @@ -132,6 +132,22 @@ bin/start-with-step aws stepfunctions create-state-machine --endpoint http://localhost:8083 --definition file://state_machines/av_download.json --name "hlsStitcherStepFunction" --role-arn arn:aws:iam::012345678901:role/DummyRole ``` +## Deploying a development branch + +``` +# sam sync --watch will do hot deploys as you make changes. If you don't want this, switch below command to sam sync or deploy + +export STACK_NAME=dc-api-yourdevprefix +export CONFIG_ENV=staging + +sam sync --watch --stack-name $STACK_NAME \ + --config-env $CONFIG_ENV \ + --config-file ./samconfig.toml \ + --parameter-overrides $(while IFS='=' read -r key value; do params+=" $key=$value"; done < ./$CONFIG_ENV.parameters && echo "$params CustomDomainHost=$STACK_NAME") +``` + +This will give you API routes like: `https://dc-api-yourdevprefix.rdc-staging.library.northwestern.edu/chat-endpoint` + ## Deploying the API manually - Symlink the `*.parameters` file you need from `tfvars/dc-api/` to the application root diff --git a/chat/src/event_config.py b/chat/src/event_config.py index 9c9facbc..7794c8e3 100644 --- a/chat/src/event_config.py +++ b/chat/src/event_config.py @@ -20,7 +20,7 @@ K_VALUE = 5 MAX_K = 100 TEMPERATURE = 0.2 -TEXT_KEY = "title" +TEXT_KEY = "id" VERSION = "2023-07-01-preview" @dataclass @@ -63,7 +63,6 @@ def __post_init__(self): self.attributes = self._get_attributes() self.azure_endpoint = self._get_azure_endpoint() 
self.azure_resource_name = self._get_azure_resource_name() - self.azure_endpoint = self._get_azure_endpoint() self.debug_mode = self._is_debug_mode_enabled() self.deployment_name = self._get_deployment_name() self.is_logged_in = self.api_token.is_logged_in() diff --git a/chat/src/handlers/opensearch_neural_search.py b/chat/src/handlers/opensearch_neural_search.py new file mode 100644 index 00000000..09b59cb2 --- /dev/null +++ b/chat/src/handlers/opensearch_neural_search.py @@ -0,0 +1,88 @@ +from langchain_core.documents import Document +from langchain_core.vectorstores import VectorStore +from opensearchpy import OpenSearch +from typing import Any, List, Tuple + + +class OpenSearchNeuralSearch(VectorStore): + """Read-only OpenSearch vectorstore with neural search.""" + + def __init__( + self, + client: None, + endpoint: str, + index: str, + model_id: str, + vector_field: str = "embedding", + search_pipeline: str = None, + text_field: str = "id", + **kwargs: Any, + ): + self.client = client or OpenSearch( + hosts=[{"host": endpoint, "port": "443", "use_ssl": True}], **kwargs + ) + self.index = index + self.model_id = model_id + self.vector_field = vector_field + self.search_pipeline = search_pipeline + self.text_field = text_field + + def similarity_search( + self, query: str, k: int = 10, subquery: Any = None, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to the embedding vector.""" + docs_with_scores = self.similarity_search_with_score( + query, k, subquery, **kwargs + ) + return [doc[0] for doc in docs_with_scores] + + def similarity_search_with_score( + self, query: str, k: int = 10, subquery: Any = None, **kwargs: Any + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query.""" + dsl = { + "size": k, + "query": { + "hybrid": { + "queries": [ + { + "neural": { + self.vector_field: { + "query_text": query, + "model_id": self.model_id, + "k": k, + } + } + } + ] + } + }, + } + + if subquery: + 
dsl["query"]["hybrid"]["queries"].append(subquery) + + for key, value in kwargs.items(): + dsl[key] = value + + response = self.client.search(index=self.index, body=dsl, params={"search_pipeline": self.search_pipeline} if self.search_pipeline else None) + + documents_with_scores = [ + ( + Document( + page_content=hit["_source"][self.text_field], + metadata=(hit["_source"]), + ), + hit["_score"], + ) + for hit in response["hits"]["hits"] + ] + + return documents_with_scores + + def add_texts(self, texts: List[str], metadatas: List[dict], **kwargs: Any) -> None: + pass + + @classmethod + def from_texts(cls, texts: List[str], metadatas: List[dict], **kwargs: Any) -> None: + pass \ No newline at end of file diff --git a/chat/src/helpers/prompts.py b/chat/src/helpers/prompts.py index 397b7005..ef4fb5b5 100644 --- a/chat/src/helpers/prompts.py +++ b/chat/src/helpers/prompts.py @@ -2,10 +2,11 @@ def prompt_template() -> str: - return """Please answer the question based on the documents provided, and include some details about why the documents might be relevant to the particular question: + return """Please answer the question based on the documents provided, and include some details about why the documents might be relevant to the particular question. The 'title' field is the document title, and the 'source' field is a UUID that uniquely identifies each document: Documents: {context} +Format the answer as raw markdown. Insert links when referencing documents by title using its UUID, as in the following guide: [title](https://dc.library.northwestern.edu/items/UUID). Example: [Judy Collins, Jackson Hole Folk Festival](https://dc.library.northwestern.edu/items/f1ca513b-7d13-4af6-ad7b-8c7ffd1d3a37). Suggest keyword searches using the following guide (example: [jazz musicians](https://dc.library.northwestern.edu/search?q=Jazz+musicians)). Offer search terms that vary in scope, highlight specific individuals or groups, or delve deeper into a topic. 
Remember to include as many direct links to Digital Collections searches as needed for comprehensive study. The `collection` field contains information about the collection the document belongs to. When many of the documents are from the same collection, mention the collection and link to the collection using the collection title and id: [collection['title']](https://dc.library.northwestern.edu/collections/collection['id']), for example [World War II Poster Collection](https://dc.library.northwestern.edu/collections/faf4f60e-78e0-4fbf-96ce-4ca8b4df597a): Question: {question} diff --git a/chat/src/helpers/response.py b/chat/src/helpers/response.py index a3b946d4..351be8ff 100644 --- a/chat/src/helpers/response.py +++ b/chat/src/helpers/response.py @@ -48,8 +48,9 @@ def extract_prompt_value(v): def prepare_response(config): try: + subquery = {"match": {"all_text": {"query": config.question}}} docs = config.opensearch.similarity_search( - config.question, k=config.k, vector_field="embedding", text_field="id" + query=config.question, k=config.k, subquery=subquery, _source={"excludes": ["embedding"]} ) original_question = get_and_send_original_question(config, docs) response = config.chain({"question": config.question, "input_documents": docs}) diff --git a/chat/src/setup.py b/chat/src/setup.py index 39a99338..ba3cb72c 100644 --- a/chat/src/setup.py +++ b/chat/src/setup.py @@ -1,7 +1,5 @@ -from content_handler import ContentHandler from langchain_community.chat_models import AzureChatOpenAI -from langchain_community.embeddings import SagemakerEndpointEmbeddings -from langchain_community.vectorstores import OpenSearchVectorSearch +from handlers.opensearch_neural_search import OpenSearchNeuralSearch from opensearchpy import OpenSearch, RequestsHttpConnection from requests_aws4auth import AWS4Auth import os @@ -22,7 +20,7 @@ def opensearch_client(region_name=os.getenv("AWS_REGION")): print(region_name) session = boto3.Session(region_name=region_name) awsauth = 
AWS4Auth(region=region_name, service="es", refreshable_credentials=session.get_credentials()) - endpoint = os.getenv("ELASTICSEARCH_ENDPOINT") + endpoint = os.getenv("OPENSEARCH_ENDPOINT") return OpenSearch( hosts=[{'host': endpoint, 'port': 443}], @@ -35,20 +33,14 @@ def opensearch_vector_store(region_name=os.getenv("AWS_REGION")): session = boto3.Session(region_name=region_name) awsauth = AWS4Auth(region=region_name, service="es", refreshable_credentials=session.get_credentials()) - sagemaker_client = session.client(service_name="sagemaker-runtime", region_name=session.region_name) - embeddings = SagemakerEndpointEmbeddings( - client=sagemaker_client, - region_name=session.region_name, - endpoint_name=os.getenv("EMBEDDING_ENDPOINT"), - content_handler=ContentHandler() - ) - - docsearch = OpenSearchVectorSearch( - index_name=prefix("dc-v2-work"), - embedding_function=embeddings, - opensearch_url="https://" + os.getenv("ELASTICSEARCH_ENDPOINT"), + docsearch = OpenSearchNeuralSearch( + index=prefix("dc-v2-work"), + model_id=os.getenv("OPENSEARCH_MODEL_ID"), + endpoint=os.getenv("OPENSEARCH_ENDPOINT"), connection_class=RequestsHttpConnection, http_auth=awsauth, + search_pipeline=prefix("dc-v2-work-pipeline"), + text_field= "id" ) return docsearch diff --git a/chat/template.yaml b/chat/template.yaml index d7696246..24b95aac 100644 --- a/chat/template.yaml +++ b/chat/template.yaml @@ -8,21 +8,22 @@ Parameters: AzureOpenaiApiKey: Type: String Description: Azure OpenAI API Key - AzureOpenaiEmbeddingDeploymentId: - Type: String - Description: Azure OpenAI Embedding Deployment ID AzureOpenaiLlmDeploymentId: Type: String Description: Azure OpenAI LLM Deployment ID AzureOpenaiResourceName: Type: String Description: Azure OpenAI Resource Name - ElasticsearchEndpoint: + EnvironmentPrefix: + Type: String + Description: Prefix for Index names + Default: "" + OpenSearchEndpoint: Type: String - Description: Elasticsearch URL - EmbeddingEndpoint: + Description: OpenSearch Endpoint 
+ OpenSearchModelId: Type: String - Description: Sagemaker Inference Endpoint + Description: OpenSearch Model ID Resources: ApiGwAccountConfig: Type: "AWS::ApiGateway::Account" @@ -199,11 +200,11 @@ Resources: Variables: API_TOKEN_SECRET: !Ref ApiTokenSecret AZURE_OPENAI_API_KEY: !Ref AzureOpenaiApiKey - AZURE_OPENAI_EMBEDDING_DEPLOYMENT_ID: !Ref AzureOpenaiEmbeddingDeploymentId AZURE_OPENAI_LLM_DEPLOYMENT_ID: !Ref AzureOpenaiLlmDeploymentId AZURE_OPENAI_RESOURCE_NAME: !Ref AzureOpenaiResourceName - ELASTICSEARCH_ENDPOINT: !Ref ElasticsearchEndpoint - EMBEDDING_ENDPOINT: !Ref EmbeddingEndpoint + ENV_PREFIX: !Ref EnvironmentPrefix + OPENSEARCH_ENDPOINT: !Ref OpenSearchEndpoint + OPENSEARCH_MODEL_ID: !Ref OpenSearchModelId Policies: - Statement: - Effect: Allow @@ -217,12 +218,6 @@ Resources: - 'es:ESHttpGet' - 'es:ESHttpPost' Resource: '*' - - Statement: - - Effect: Allow - Action: - - 'sagemaker:InvokeEndpoint' - - 'sagemaker:InvokeEndpointAsync' - Resource: !Sub 'arn:aws:sagemaker:${AWS::Region}:${AWS::AccountId}:endpoint/${EmbeddingEndpoint}' Metadata: BuildMethod: nodejs18.x Deployment: diff --git a/chat/test/handlers/test_opensearch_neural_search.py b/chat/test/handlers/test_opensearch_neural_search.py new file mode 100644 index 00000000..d7448679 --- /dev/null +++ b/chat/test/handlers/test_opensearch_neural_search.py @@ -0,0 +1,43 @@ +# ruff: noqa: E402 +import sys +sys.path.append('./src') + +from unittest import TestCase +from handlers.opensearch_neural_search import OpenSearchNeuralSearch +from langchain_core.documents import Document + +class MockClient(): + def search(self, index, body, params): + return { + "hits": { + "hits": [ + { + "_source": { + "id": "test" + }, + "_score": 0.12345 + } + ] + } + } + +class TestOpenSearchNeuralSearch(TestCase): + def test_similarity_search(self): + docs = OpenSearchNeuralSearch(client=MockClient(), endpoint="test", index="test", model_id="test").similarity_search(query="test", subquery={"_source": {"excludes": 
["embedding"]}}, size=10) + self.assertEqual(docs, [Document(page_content='test', metadata={'id': 'test'})]) + + def test_similarity_search_with_score(self): + docs = OpenSearchNeuralSearch(client=MockClient(), endpoint="test", index="test", model_id="test").similarity_search_with_score(query="test") + self.assertEqual(docs, [(Document(page_content='test', metadata={'id': 'test'}), 0.12345)]) + + def test_add_texts(self): + try: + OpenSearchNeuralSearch(client=MockClient(), endpoint="test", index="test", model_id="test").add_texts(texts=["test"], metadatas=[{"id": "test"}]) + except Exception as e: + self.fail(f"from_texts raised an exception: {e}") + + def test_from_texts(self): + try: + OpenSearchNeuralSearch.from_texts(clas="test", texts=["test"], metadatas=[{"id": "test"}]) + except Exception as e: + self.fail(f"from_texts raised an exception: {e}") \ No newline at end of file diff --git a/chat/test/helpers/test_metrics.py b/chat/test/helpers/test_metrics.py index 651043eb..efab07cd 100644 --- a/chat/test/helpers/test_metrics.py +++ b/chat/test/helpers/test_metrics.py @@ -48,7 +48,7 @@ def test_token_usage(self): expected_result = { "answer": 6, - "prompt": 36, + "prompt": 328, "question": 15, "source_documents": 1, } diff --git a/chat/test/test_event_config.py b/chat/test/test_event_config.py index 55f8381d..1be422d5 100644 --- a/chat/test/test_event_config.py +++ b/chat/test/test_event_config.py @@ -57,7 +57,7 @@ def test_attempt_override_without_superuser_status(self): "question": "test question", "ref": "test ref", "temperature": 0.2, - "text_key": "title", + "text_key": "id", } self.assertEqual(actual.azure_endpoint, expected_output["azure_endpoint"]) self.assertEqual(actual.attributes, expected_output["attributes"]) diff --git a/dev/env.json b/dev/env.json index ffcaadf5..8daeb73f 100644 --- a/dev/env.json +++ b/dev/env.json @@ -1,7 +1,7 @@ { "Parameters": { "API_TOKEN_SECRET": "DEVELOPMENT_SECRET", - "ELASTICSEARCH_ENDPOINT": "", + "OPENSEARCH_ENDPOINT": 
"", "ENV_PREFIX": "", "DC_URL": "" } diff --git a/node/src/api/opensearch.js b/node/src/api/opensearch.js index 82926223..9c6d98bb 100644 --- a/node/src/api/opensearch.js +++ b/node/src/api/opensearch.js @@ -1,6 +1,6 @@ const { HttpRequest } = require("@aws-sdk/protocol-http"); const { awsFetch } = require("../aws/fetch"); -const { elasticsearchEndpoint, prefix } = require("../environment"); +const { openSearchEndpoint, prefix } = require("../environment"); const Honeybadger = require("../honeybadger-setup"); async function getCollection(id, opts) { @@ -65,7 +65,7 @@ function isVisible(doc, { allowPrivate, allowUnpublished }) { } function initRequest(path) { - const endpoint = elasticsearchEndpoint(); + const endpoint = openSearchEndpoint(); return new HttpRequest({ method: "GET", @@ -80,7 +80,7 @@ function initRequest(path) { async function search(targets, body, optionsQuery = {}) { Honeybadger.addBreadcrumb("Searching", { metadata: { targets, body } }); - const endpoint = elasticsearchEndpoint(); + const endpoint = openSearchEndpoint(); const request = new HttpRequest({ method: "POST", @@ -98,7 +98,7 @@ async function search(targets, body, optionsQuery = {}) { } async function scroll(scrollId) { - const endpoint = elasticsearchEndpoint(); + const endpoint = openSearchEndpoint(); const request = new HttpRequest({ method: "POST", @@ -114,7 +114,7 @@ async function scroll(scrollId) { } async function deleteScroll(scrollId) { - const endpoint = elasticsearchEndpoint(); + const endpoint = openSearchEndpoint(); const request = new HttpRequest({ method: "DELETE", diff --git a/node/src/environment.js b/node/src/environment.js index 3443075c..44958569 100644 --- a/node/src/environment.js +++ b/node/src/environment.js @@ -40,8 +40,8 @@ function dcUrl() { return process.env.DC_URL; } -function elasticsearchEndpoint() { - return process.env.ELASTICSEARCH_ENDPOINT; +function openSearchEndpoint() { + return process.env.OPENSEARCH_ENDPOINT; } function prefix(value) { @@ -61,7 
+61,7 @@ module.exports = { appInfo, dcApiEndpoint, dcUrl, - elasticsearchEndpoint, + openSearchEndpoint, prefix, region, }; diff --git a/node/test/test-helpers/index.js b/node/test/test-helpers/index.js index 8045b1ed..84e4afa3 100644 --- a/node/test/test-helpers/index.js +++ b/node/test/test-helpers/index.js @@ -46,7 +46,7 @@ function mockIndex() { const mock = nock("https://index.test.library.northwestern.edu"); beforeEach(function () { - process.env.ELASTICSEARCH_ENDPOINT = "index.test.library.northwestern.edu"; + process.env.OPENSEARCH_ENDPOINT = "index.test.library.northwestern.edu"; }); afterEach(function () { diff --git a/node/test/unit/aws/environment.test.js b/node/test/unit/aws/environment.test.js index 304aaf57..b3391e1f 100644 --- a/node/test/unit/aws/environment.test.js +++ b/node/test/unit/aws/environment.test.js @@ -9,8 +9,8 @@ describe("environment", function () { helpers.saveEnvironment(); it("returns the index endpoint", function () { - process.env.ELASTICSEARCH_ENDPOINT = "index.test.library.northwestern.edu"; - expect(environment.elasticsearchEndpoint()).to.eq( + process.env.OPENSEARCH_ENDPOINT = "index.test.library.northwestern.edu"; + expect(environment.openSearchEndpoint()).to.eq( "index.test.library.northwestern.edu" ); }); diff --git a/template.yaml b/template.yaml index 9e4184d5..91c538fc 100644 --- a/template.yaml +++ b/template.yaml @@ -19,7 +19,7 @@ Globals: API_TOKEN_SECRET: !Ref ApiTokenSecret DC_API_ENDPOINT: !Ref DcApiEndpoint DC_URL: !Ref DcUrl - ELASTICSEARCH_ENDPOINT: !Ref ElasticsearchEndpoint + OPENSEARCH_ENDPOINT: !Ref OpenSearchEndpoint ENV_PREFIX: !Ref EnvironmentPrefix HONEYBADGER_API_KEY: !Ref HoneybadgerApiKey HONEYBADGER_ENV: !Ref HoneybadgerEnv @@ -35,9 +35,6 @@ Parameters: AzureOpenaiApiKey: Type: String Description: Azure OpenAI API Key - AzureOpenaiEmbeddingDeploymentId: - Type: String - Description: Azure OpenAI Embedding Deployment ID AzureOpenaiLlmDeploymentId: Type: String Description: Azure OpenAI LLM 
Deployment ID @@ -59,12 +56,12 @@ Parameters: DcUrl: Type: String Description: URL of Digital Collections website - ElasticsearchEndpoint: + OpenSearchModelId: Type: String - Description: Elasticsearch url - EmbeddingEndpoint: + Description: OpenSearch Model ID + OpenSearchEndpoint: Type: String - Description: Sagemaker Inference Endpoint + Description: OpenSearch endpoint EnvironmentPrefix: Type: String Description: Index Prefix @@ -653,11 +650,11 @@ Resources: Parameters: ApiTokenSecret: !Ref ApiTokenSecret AzureOpenaiApiKey: !Ref AzureOpenaiApiKey - AzureOpenaiEmbeddingDeploymentId: !Ref AzureOpenaiEmbeddingDeploymentId AzureOpenaiLlmDeploymentId: !Ref AzureOpenaiLlmDeploymentId AzureOpenaiResourceName: !Ref AzureOpenaiResourceName - ElasticsearchEndpoint: !Ref ElasticsearchEndpoint - EmbeddingEndpoint: !Ref EmbeddingEndpoint + EnvironmentPrefix: !Ref EnvironmentPrefix + OpenSearchEndpoint: !Ref OpenSearchEndpoint + OpenSearchModelId: !Ref OpenSearchModelId chatWebsocketEndpoint: Type: AWS::Serverless::Function Properties: