Commit

feat: refactor ai-assistant plugin and add vector search support (pinterest#1325)

* feat: use websocket for ai assistant (pinterest#1311)

* feat: use websocket for ai assistant

* fix node test

* comments

* remove memory and add keep button

* fix linter

* feat: add embedding based table search support (pinterest#1314)

* feat: add embedding based table search support

* update

* build fail

* linter

* test failure

* comments

* nodetest

* opensearch volume path

* docs: ai assistant plugin (pinterest#1323)

* feat: add vector table search (pinterest#1322)

* feat: add vector table search

* fix linter

* remove realtime record query cell

* handle table select exceptions

* add public config

* comments

* remove unused config

* update var name metatore_id as metastoreId

* comments
jczhong84 authored and aidenprice committed Jan 3, 2024
1 parent 9964555 commit beda6e3
Showing 74 changed files with 2,143 additions and 850 deletions.
38 changes: 37 additions & 1 deletion containers/bundled_querybook_config.yaml
@@ -3,6 +3,42 @@ FLASK_SECRET_KEY: SOME_RANDOM_SECRET_KEY

DATABASE_CONN: mysql+pymysql://test:passw0rd@mysql:3306/querybook2?charset=utf8mb4
REDIS_URL: redis://redis:6379/0
-ELASTICSEARCH_HOST: elasticsearch:9200
+ELASTICSEARCH_HOST: http://elasticsearch:9200
# ELASTICSEARCH_CONNECTION_TYPE: aws
# Uncomment for email
# EMAILER_CONN: dockerhostforward

# Uncomment below to enable AI Assistant for local development.
# AI_ASSISTANT_PROVIDER: openai
# AI_ASSISTANT_CONFIG:
# default:
# model_args:
# model_name: gpt-3.5-turbo-16k
# temperature: 0
# context_length: 16384
# reserved_tokens: 2048
# table_summary:
# model_args:
# model_name: gpt-3.5-turbo-16k
# temperature: 0
# context_length: 16384
# query_summary:
# model_args:
# model_name: gpt-3.5-turbo-16k
# temperature: 0
# context_length: 16384
# table_select:
# model_args:
# model_name: gpt-3.5-turbo-16k
# temperature: 0
# context_length: 16384

# Uncomment below to enable vector store to support embedding based table search.
# Please check langchain doc for the configs of each provider.
# EMBEDDINGS_PROVIDER: openai
# EMBEDDINGS_CONFIG: ~
# VECTOR_STORE_PROVIDER: opensearch
# VECTOR_STORE_CONFIG:
# embeddings_arg_name: 'embedding_function'
# opensearch_url: http://elasticsearch:9200
# index_name: 'vector_index_v1'
9 changes: 5 additions & 4 deletions docker-compose.yml
@@ -99,12 +99,13 @@ services:
command: ['--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
elasticsearch:
container_name: querybook_elasticsearch
-image: docker.elastic.co/elasticsearch/elasticsearch:7.16.2
+image: opensearchproject/opensearch:2.9.0
environment:
cluster.name: docker-cluster
bootstrap.memory_lock: 'true'
discovery.type: single-node
-ES_JAVA_OPTS: -Xms750m -Xmx750m
+plugins.security.disabled: true
+OPENSEARCH_JAVA_OPTS: -Xms750m -Xmx750m
ulimits:
memlock:
soft: -1
@@ -113,7 +114,7 @@ services:
soft: 65536
hard: 65536
volumes:
-  - esdata1:/usr/share/elasticsearch/data
+  - osdata1:/usr/share/opensearch/data
ports:
- 9200:9200
healthcheck:
@@ -161,7 +162,7 @@ services:

volumes:
my-db:
-esdata1:
+osdata1:
driver: local
# file:
# driver: local
47 changes: 47 additions & 0 deletions docs_website/docs/integrations/add_ai_assistant.md
@@ -0,0 +1,47 @@
---
id: add_ai_assistant
title: AI Assistant
sidebar_label: AI Assistant
---

:::info
Please check the [user guide](../user_guide/ai_assistant.md) to see what the AI assistant features look like.
:::

The AI assistant plugin is powered by large language models (LLMs), like ChatGPT from OpenAI. We're using [Langchain](https://python.langchain.com/docs/get_started/introduction) to build the plugin.

## AI Assistant Plugin

The AI Assistant plugin allows users to do title generation, text-to-sql, and query auto fix.

Please follow the steps below to enable the AI assistant plugin:

1. [Optional] Create your own AI assistant provider if needed. Please refer to `querybook/server/lib/ai_assistant/openai_assistant.py` as an example.

2. Add your provider in `plugins/ai_assistant_plugin/__init__.py`

3. Add configs in the `querybook_config.yaml`. Please refer to `containers/bundled_querybook_config.yaml` as an example, and check the model's official doc for all available model args.

   - Don't forget to set the proper environment variables for your provider, e.g. for OpenAI, you'll need `OPENAI_API_KEY`.

4. Enable it in `querybook/config/querybook_public_config.yaml`
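For step 2, the registration is presumably a small mapping from provider name to provider class, mirroring the vector store plugin example included in this commit (`plugins/vector_store_plugin/__init__.py`). The registry variable name and provider class below are assumptions for illustration, not the actual contents of `plugins/ai_assistant_plugin/__init__.py`:

```python
# Hypothetical sketch of plugins/ai_assistant_plugin/__init__.py.
# The registry name (ALL_PLUGIN_AI_ASSISTANTS) and the provider class are
# assumptions, modeled on the ALL_PLUGIN_* dicts the vector store plugin uses.

# from lib.ai_assistant.openai_assistant import OpenAIAssistant


class MyCompanyAssistant:
    """Stand-in for a custom provider, e.g. one modeled on the
    openai_assistant.py example referenced in step 1."""

    name = "mycompany"


ALL_PLUGIN_AI_ASSISTANTS = {"mycompany": MyCompanyAssistant}
```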

## Vector Store Plugin

The vector store plugin supports embedding-based table search using natural language. It requires an embeddings provider and a vector store. Please check the Langchain docs for more details on available [embeddings](https://python.langchain.com/docs/integrations/text_embedding/) and [vector stores](https://python.langchain.com/docs/integrations/vectorstores/).

:::note
How to set up and host a vector store or use a cloud vector store solution is not covered here. You can choose your own vector db solution.
:::

1. [Optional] Create your own embeddings or vector store if needed. Please refer to `querybook/server/lib/vector_store/stores/opensearch.py` as an example.

2. Add the providers in `plugins/vector_store_plugin/__init__.py`

3. Add configs in the `querybook_config.yaml`. Please refer to `containers/bundled_querybook_config.yaml` as an example, and check the Langchain doc for the configs each vector store requires.

   - Also don't forget to set the proper environment variables for your provider, e.g. for OpenAI embeddings, you'll need `OPENAI_API_KEY`.

4. Enable it in `querybook/config/querybook_public_config.yaml`

With the vector store plugin enabled, text-to-sql will also use it to find tables if they are not provided by the user.
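The fallback described above can be sketched roughly as follows. This is illustrative only, not the actual Querybook implementation: the function names are made up, `search_fn` stands in for the vector table search, and the default limit of 3 simply mirrors the table-select limit constant added in this commit.

```python
def tables_for_text_to_sql(user_tables, metastore_id, question, search_fn, limit=3):
    """Hypothetical sketch: prefer the user's selected tables; otherwise
    fall back to vector table search and keep the top few matches.

    `search_fn(metastore_id, question)` stands in for the vector-store
    table search; `limit` mirrors DEFAULT_TABLE_SELECT_LIMIT in spirit.
    """
    if user_tables:
        return user_tables
    return search_fn(metastore_id, question)[:limit]
```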
4 changes: 3 additions & 1 deletion docs_website/docs/integrations/plugins.md
@@ -72,11 +72,13 @@ DAG Exporters allow users to create a workflow from Query Cells in DataDocs. Que

The API plugin enables you to create new or modify existing API endpoints. For examples of its usage, please refer to the `querybook/plugins/api_plugin` folder.


### Monkey Patch Plugin

Similar to the API plugin, the monkey patch plugin allows you to override or modify existing modules or functions. To see examples of how it works, please check the `querybook/plugins/monkey_patch_plugin` folder.

### AI Assistant Plugin

The AI assistant plugin provides features like title generation, text-to-sql, and query auto fix, powered by LLMs. Please check the [AI Assistant Guide](./add_ai_assistant.md) to learn how to enable it.

## Installing Plugins

44 changes: 44 additions & 0 deletions docs_website/docs/user_guide/ai_assistant.md
@@ -0,0 +1,44 @@
---
id: ai_assistant
title: AI Assistant
sidebar_label: AI Assistant
---

If the [AI Assistant plugin](../integrations/add_ai_assistant.md) is enabled, you'll be able to use the AI features below, powered by LLMs.

## Title Generation

Clicking the '#' icon will generate the title of the query cell automatically.
![](/img/user_guide/title_generation.gif)

## Text To SQL

Hovering over the left side of the query cell and clicking the star-like icon will open the text-to-sql modal.

### Query Generation

To use it, select the table(s) you are going to query against, type your question prompt, and hit Enter.

If you're unsure which table to use, you can also type your question directly and AI will try to find the table(s) for you.

![](/img/user_guide/text_to_sql.gif)

### Query Editing

If you would like to modify the generated query, you can keep the query and type in the prompt to edit it.

If the query cell already has a query, opening the text-to-sql modal will automatically switch to edit mode.

![](/img/user_guide/text_to_sql_edit.gif)

## SQL Fix

If your query fails, you will see an 'Auto fix' button at the right corner of the error message.

![](/img/user_guide/sql_fix.gif)

## Search Table by Natural Language

If the [vector store](../integrations/add_ai_assistant.md#vector-store-plugin) of the AI assistant plugin is also enabled, you'll be able to search tables by natural language as well as keyword-based search.

![](/img/user_guide/table_vector_search.png)
7 changes: 6 additions & 1 deletion docs_website/sidebars.json
@@ -38,6 +38,7 @@
"integrations/add_table_upload",
"integrations/add_event_logger",
"integrations/add_stats_logger",
"integrations/add_ai_assistant",
"integrations/customize_html",
"integrations/embedded_iframe"
],
@@ -50,7 +51,11 @@
"developer_guide/users_and_groups"
],

-    "User Guide": ["user_guide/api_token", "user_guide/faq"],
+    "User Guide": [
+        "user_guide/ai_assistant",
+        "user_guide/api_token",
+        "user_guide/faq"
+    ],
"Changelog": [
"changelog/breaking_changes",
"changelog/security_advisories",
Binary file added docs_website/static/img/user_guide/sql_fix.gif
Binary file added docs_website/static/img/user_guide/table_vector_search.png
Binary file added docs_website/static/img/user_guide/text_to_sql.gif
Binary file added docs_website/static/img/user_guide/text_to_sql_edit.gif
Binary file added docs_website/static/img/user_guide/title_generation.gif
10 changes: 10 additions & 0 deletions plugins/vector_store_plugin/__init__.py
@@ -0,0 +1,10 @@
ALL_PLUGIN_VECTOR_STORES = {}
ALL_PLUGIN_EMBEDDINGS = {}

# Example to add vector store

# from lib.vector_store.stores.opensearch import OpenSearchVectorStore
# from langchain.embeddings import OpenAIEmbeddings

# ALL_PLUGIN_VECTOR_STORES = {"opensearch": OpenSearchVectorStore}
# ALL_PLUGIN_EMBEDDINGS = {"openai": OpenAIEmbeddings}
22 changes: 16 additions & 6 deletions querybook/config/querybook_default_config.yaml
@@ -86,11 +86,21 @@ EVENT_LOGGER_NAME: ~
STATS_LOGGER_NAME: ~

# --------------- AI Assistant ---------------
# Example config for OpenAI
# AI_ASSISTANT_PROVIDER: openai
# AI_ASSISTANT_CONFIG:
# model_name: gpt-3.5-turbo
# temperature: 0
AI_ASSISTANT_PROVIDER: ~
AI_ASSISTANT_CONFIG:
model_name: ~
default:
model_args:
model_name: ~
temperature: ~
context_length: ~
reserved_tokens: ~
table_select:
fetch_k: ~
top_n: ~

EMBEDDINGS_PROVIDER: ~
EMBEDDINGS_CONFIG: ~
VECTOR_STORE_PROVIDER: ~
VECTOR_STORE_CONFIG:
embeddings_arg_name: 'embedding_function'
index_name: 'vector_index_v1'
3 changes: 3 additions & 0 deletions querybook/config/querybook_public_config.yaml
@@ -10,3 +10,6 @@ ai_assistant:

query_auto_fix:
enabled: true

table_vector_search:
enabled: false
30 changes: 30 additions & 0 deletions querybook/server/const/ai_assistant.py
@@ -0,0 +1,30 @@
from enum import Enum


# KEEP IT CONSISTENT WITH webapp/const/aiAssistant.ts
class AICommandType(Enum):
SQL_FIX = "sql_fix"
SQL_TITLE = "sql_title"
TEXT_TO_SQL = "text_to_sql"
SQL_SUMMARY = "sql_summary"
TABLE_SUMMARY = "table_summary"
TABLE_SELECT = "table_select"


AI_ASSISTANT_NAMESPACE = "/ai_assistant"


DEFAULT_SAMPLE_QUERY_COUNT = 50
MAX_SAMPLE_QUERY_COUNT_FOR_TABLE_SUMMARY = 5


# the minimum score for a table to be considered a match
DEFAULT_SIMILARITY_SCORE_THRESHOLD = 0.6
# the minimum score for a table to be considered a great match
DEFAULT_SIMILARITY_SCORE_THRESHOLD_GREAT_MATCH = 0.7
# how many docs to fetch from the vector store; they may include both table and query summary docs, which need additional processing
DEFAULT_VECTOR_STORE_FETCH_LIMIT = 30
# how many tables to return from vector table search eventually
DEFAULT_TABLE_SEARCH_LIMIT = 10
# how many tables to select for text-to-sql
DEFAULT_TABLE_SELECT_LIMIT = 3
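The two similarity thresholds above suggest a simple tiering of vector-search matches. A hedged sketch of how they could partition scores (the actual ranking logic in Querybook may differ):

```python
# Thresholds as defined in const/ai_assistant.py above.
DEFAULT_SIMILARITY_SCORE_THRESHOLD = 0.6
DEFAULT_SIMILARITY_SCORE_THRESHOLD_GREAT_MATCH = 0.7


def tier_of(score: float) -> str:
    # Illustrative only: classify a similarity score into the tiers
    # implied by the two thresholds.
    if score >= DEFAULT_SIMILARITY_SCORE_THRESHOLD_GREAT_MATCH:
        return "great match"
    if score >= DEFAULT_SIMILARITY_SCORE_THRESHOLD:
        return "match"
    return "no match"
```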
2 changes: 0 additions & 2 deletions querybook/server/datasources/__init__.py
@@ -16,7 +16,6 @@
from . import event_log
from . import data_element
from . import comment
-from . import ai_assistant

# Keep this at the end of imports to make sure the plugin APIs override the default ones
try:
@@ -43,5 +42,4 @@
event_log
data_element
comment
-ai_assistant
api_plugin
48 changes: 0 additions & 48 deletions querybook/server/datasources/ai_assistant.py

This file was deleted.

14 changes: 13 additions & 1 deletion querybook/server/datasources/search.py
@@ -16,7 +16,8 @@
)
from lib.elasticsearch.suggest_table import construct_suggest_table_query
from lib.elasticsearch.suggest_user import construct_suggest_user_query
-from logic.elasticsearch import ES_CONFIG
+from lib.elasticsearch.search_utils import ES_CONFIG
+from logic import vector_store as vs_logic

LOG = get_logger(__file__)

@@ -112,9 +113,20 @@ def search_tables(
results, count = get_matching_objects(
query, ES_CONFIG["tables"]["index_name"], True
)

return {"count": count, "results": results}


@register("/search/tables/vector/", methods=["GET"])
def vector_search_tables(
metastore_id,
keywords,
filters=None,
):
verify_metastore_permission(metastore_id)
return vs_logic.search_tables(metastore_id, keywords, filters)


@register("/suggest/<int:metastore_id>/tables/", methods=["GET"])
def suggest_tables(metastore_id, prefix, limit=10):
api_assert(limit is None or limit <= 100, "Requesting too many tables")
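For reference, the new `/search/tables/vector/` endpoint registered above is a plain GET route taking `metastore_id` and `keywords`. A client-side sketch of building the request URL (the base URL is a placeholder, any API path prefix and authentication are deployment-specific and omitted here):

```python
from urllib.parse import urlencode


def vector_table_search_url(base_url: str, metastore_id: int, keywords: str) -> str:
    # Builds a GET URL for the vector table search endpoint added in this
    # commit. Auth (session cookie / API token) and any route prefix that
    # the deployment adds in front of the path are not covered here.
    params = {"metastore_id": metastore_id, "keywords": keywords}
    return f"{base_url}/search/tables/vector/?{urlencode(params)}"
```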
2 changes: 2 additions & 0 deletions querybook/server/datasources_socketio/__init__.py
@@ -1,7 +1,9 @@
from . import query_execution
from . import datadoc
from . import connect
+from . import ai_assistant

connect
query_execution
datadoc
+ai_assistant