diff --git a/.env b/.env new file mode 100644 index 000000000..b53a64a78 --- /dev/null +++ b/.env @@ -0,0 +1,2 @@ +HUGGINGFACEHUB_API_TOKEN=hf_xxxx +OPENAI_API_KEY=sk-xxxx diff --git a/notebooks/text/imdb_search_milvus_client.ipynb b/notebooks/text/imdb_search_milvus_client.ipynb new file mode 100755 index 000000000..45e13f38f --- /dev/null +++ b/notebooks/text/imdb_search_milvus_client.ipynb @@ -0,0 +1,1124 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "369c3444", + "metadata": {}, + "source": [ + "# IMDB Vector Search using Milvus Client" + ] + }, + { + "cell_type": "markdown", + "id": "f6ffd11a", + "metadata": {}, + "source": [ + "First, import some common libraries and define the data reading functions." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d7570b2e", + "metadata": {}, + "outputs": [], + "source": [ + "# For colab install these libraries in this order:\n", + "# !pip install milvus, pymilvus, langchain, torch, transformers, python-dotenv\n", + "\n", + "# Import common libraries.\n", + "import time\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Import custom functions for splitting and search.\n", + "import imdb_utilities" + ] + }, + { + "cell_type": "markdown", + "id": "8a67e382", + "metadata": {}, + "source": [ + "## Start up a local Milvus server." + ] + }, + { + "cell_type": "markdown", + "id": "fb844837", + "metadata": {}, + "source": [ + "Code in this notebook uses [Milvus client](https://milvus.io/docs/using_milvusclient.md) with [Milvus lite](https://milvus.io/docs/milvus_lite.md), which runs a local server. ⛔️ Milvus lite is only meant for demos and local testing.\n", + "- pip install milvus pymilvus\n", + "\n", + "💡 **For production purposes**, use a local Milvus docker, Milvus clusters, or fully-managed Milvus on Zilliz Cloud.\n", + "- [Local Milvus docker](https://milvus.io/docs/install_standalone-docker.md) requires local docker installed and running.\n", + "- [Milvus clusters](https://milvus.io/docs/install_cluster-milvusoperator.md) requires a K8s cluster up and running.\n", + "- [Ziliz Cloud free trial](https://cloud.zilliz.com/login) choose a \"free\" option when you provision.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0806d2db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Milvus server startup time: 7.5935890674591064 sec\n", + "v2.2-testing-20230824-68-ga34a9d606-lite\n" + ] + } + ], + "source": [ + "from milvus import default_server\n", + "from pymilvus import (\n", + " connections, utility, \n", + " MilvusClient,\n", + ")\n", + "\n", + "# Cleanup previous data and stop server in case it is still running.\n", + "default_server.stop()\n", + "default_server.cleanup()\n", + "\n", + "# Start a new milvus-lite local server.\n", + "start_time = time.time()\n", + "default_server.start()\n", + "\n", + "end_time = time.time()\n", + "print(f\"Milvus server startup time: {end_time - start_time} sec\")\n", + "# startup time: 5.6739208698272705\n", + "\n", + "# Add wait to avoid error message from trying to connect.\n", + "time.sleep(15)\n", + "\n", + "# Now you could connect with localhost and the given port.\n", + "# Port is defined by default_server.listen_port.\n", + "connections.connect(host='127.0.0.1', \n", + " port=default_server.listen_port,\n", + " show_startup_banner=True)\n", + "\n", + "# Check if the server is ready.\n", + "print(utility.get_server_version())" + ] + }, + { + "cell_type": "markdown", + "id": "b01d6622", + "metadata": {}, + "source": [ + "## Load the Embedding Model checkpoint and use it to create vector embeddings\n", + "**Embedding model:** We will use the open-source [sentence transformers](https://www.sbert.net/docs/pretrained_models.html) hosted on HuggingFace to encode the movie review text. We will save the embeddings to a pandas dataframe and then into the milvus database.\n", + "\n", + "💡 Note: To keep your tokens private, best practice is to use an env variable.
\n", + "In Jupyter, need .env file (in same dir as notebooks) containing lines like this:\n", + "- VARIABLE_NAME=value" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "dd2be7fd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "device: cpu\n", + "Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.\n", + "Token is valid (permission: write).\n", + "Your token has been saved to /Users/christybergman/.cache/huggingface/token\n", + "Login successful\n", + "\n", + "SentenceTransformer(\n", + " (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel \n", + " (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})\n", + ")\n", + "model_name: BAAI/bge-base-en-v1.5\n", + "EMBEDDING_LENGTH: 768\n", + "MAX_SEQ_LENGTH: 512\n" + ] + } + ], + "source": [ + "# Import torch.\n", + "import torch\n", + "from torch.nn import functional as F\n", + "from sentence_transformers import SentenceTransformer\n", + "\n", + "# Initialize torch settings\n", + "torch.backends.cudnn.deterministic = True\n", + "RANDOM_SEED = 415\n", + "torch.manual_seed(RANDOM_SEED)\n", + "DEVICE = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')\n", + "print(f\"device: {DEVICE}\")\n", + "\n", + "import os\n", + "from dotenv import load_dotenv, find_dotenv\n", + "_ = load_dotenv(find_dotenv())\n", + "from huggingface_hub import login\n", + "\n", + "# Login to huggingface_hub\n", + "hub_token = os.getenv(\"HUGGINGFACEHUB_API_TOKEN\")\n", + "login(token=hub_token)\n", + "\n", + "# Load the model from huggingface model hub.\n", + "model_name = \"BAAI/bge-base-en-v1.5\"\n", + "retriever = SentenceTransformer(model_name, device=DEVICE)\n", + "print(type(retriever))\n", + "print(retriever)\n", + "\n", + "# Get the model parameters and save for later.\n", + "MAX_SEQ_LENGTH = retriever.get_max_seq_length() \n", + "HF_EOS_TOKEN_LENGTH = 1\n", + "EMBEDDING_LENGTH = retriever.get_sentence_embedding_dimension()\n", + "\n", + "# Inspect model parameters.\n", + "print(f\"model_name: {model_name}\")\n", + "print(f\"EMBEDDING_LENGTH: {EMBEDDING_LENGTH}\")\n", + "print(f\"MAX_SEQ_LENGTH: {MAX_SEQ_LENGTH}\")" + ] + }, + { + "cell_type": "markdown", + "id": "d2b12728", + "metadata": {}, + "source": [ + "## Create a Milvus collection\n", + "\n", + "You can think of a collection in Milvus like a \"table\" in SQL databases. The **collection** will contain the \n", + "- **Schema** (or no-schema Milvus Client). \n", + "💡 You'll need the vector `EMBEDDING_LENGTH` parameter from your embedding model.\n", + "- **Vector index** for efficient vector search\n", + "- **Vector distance metric** for measuring nearest neighbor vectors\n", + "- **Consistency level**\n", + "In Milvus, transactional consistency is possible; however, according to the [CAP theorem](https://en.wikipedia.org/wiki/CAP_theorem), some latency must be sacrificed. 💡 Searching movie reviews is not mission-critical, so [`eventually`](https://milvus.io/docs/consistency.md) consistent is fine here.\n", + "\n", + "## Add a Vector Index\n", + "\n", + "The vector index determines the vector **search algorithm** used to find the closest vectors in your data to the query a user submits. Most vector indexes use different sets of parameters depending on whether the database is:\n", + "- **inserting vectors** (creation mode) - vs - \n", + "- **searching vectors** (search mode) \n", + "\n", + "Scroll down the [docs page](https://milvus.io/docs/index.md) to see a table listing different vector indexes available on Milvus. For example:\n", + "- FLAT - deterministic exhaustive search\n", + "- IVF_FLAT or IVF_SQ8 - Hash index (stochastic approximate search)\n", + "- HNSW - Graph index (stochastic approximate search)\n", + "\n", + "Besides a search algorithm, we also need to specify a **distance metric**, that is, a definition of what is considered \"close\" in vector space. In the cell below, the [`HNSW`](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) search index is chosen. Its possible distance metrics are one of:\n", + "- L2 - L2-norm\n", + "- IP - Dot-product\n", + "- COSINE - Angular distance" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4a85b295", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Embedding length: 768\n", + "{'collection_name': 'movies', 'auto_id': True, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': 5, 'params': {}, 'auto_id': True, 'is_primary': True}, {'field_id': 101, 'name': 'vector', 'description': '', 'type': 101, 'params': {'dim': 768}}], 'aliases': [], 'collection_id': 445129100245860457, 'consistency_level': 3, 'properties': [], 'num_partitions': 1, 'enable_dynamic_field': True}\n", + "Created collection: movies\n" + ] + } + ], + "source": [ + "# Use embedding length from the embedding model.\n", + "print(f\"Embedding length: {EMBEDDING_LENGTH}\")\n", + "\n", + "# Set the Milvus collection name.\n", + "COLLECTION_NAME = \"movies\"\n", + "\n", + "# # Note: Default AUTOINDEX works on both Milvus and Zilliz Cloud (where it is the fastest!)\n", + "# index_params = {\n", + "# # Always set this to AUTOINDEX or just omit it.\n", + "# \"index_type\": \"AUTOINDEX\", \n", + "# # Defaults to IP (Inner Product) if omitted.\n", + "# \"metric_type\": \"COSINE\",\n", + "# }\n", + "\n", + "# # Show how to change the index parameters, instead of using defaults.\n", + "# Define vector index algorithm params.\n", + "INDEX_PARAMS = dict({\n", + " 'M': 16, # int. 4~64, num_neighbors, higher values takes more memory.\n", + " \"efConstruction\": 32} # int. 8~512, num_candidate_nearest_neighbors\n", + " )\n", + "# Create the search index for local Milvus server.\n", + "index_params = {\n", + " \"index_type\": \"HNSW\", \n", + " \"metric_type\": \"COSINE\", \n", + " \"params\": INDEX_PARAMS\n", + " }\n", + "\n", + "# Use no-schema Milvus client (uses flexible json key:value format).\n", + "# https://milvus.io/docs/using_milvusclient.md\n", + "mc = MilvusClient(uri=\"http://localhost\")\n", + "mc.drop_collection(COLLECTION_NAME)\n", + "mc.create_collection(COLLECTION_NAME, EMBEDDING_LENGTH, \n", + " consistency_level=\"Eventually\", \n", + " auto_id=True,\n", + " overwrite=True,\n", + " params=index_params # Use custom index params or omit.\n", + " )\n", + "\n", + "# print(\"List collections:\", mc.list_collections())\n", + "print(mc.describe_collection(COLLECTION_NAME))\n", + "print(f\"Created collection: {COLLECTION_NAME}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e735fe08", + "metadata": {}, + "source": [ + "## Read CSV data into a pandas dataframe\n", + "\n", + "The data used in this notebook is the [IMDB large movie review dataset](https://ai.stanford.edu/~amaas/data/sentiment/) from the Stanford AI Lab. It is a conveniently processed 50,000 dataset (50:50 sampled ratio Positive/Negative reviews). This data has columns: movie_index, raw review text, and movie rating." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6861beb7", + "metadata": {}, + "outputs": [], + "source": [ + "# Download data.\n", + "# https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", + "# citation: ACL 2011, @InProceedings{maas-EtAl:2011:ACL-HLT2011,\n", + "# author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},\n", + "# title = {Learning Word Vectors for Sentiment Analysis},\n", + "# booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n", + "# month = {June},\n", + "# year = {2011},\n", + "# address = {Portland, Oregon, USA},\n", + "# publisher = {Association for Computational Linguistics},\n", + "# pages = {142--150},\n", + "# url = {http://www.aclweb.org/anthology/P11-1015}\n", + "# }\n", + "\n", + "# Cleanup: move data file to data/ folder." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6a381e57", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "original df shape: (49582, 4)\n", + "df_train shape: (35000, 4), df_val shape: (5000, 4), df_test shape: (9582, 4)\n", + "Example text length: 677\n", + "Example text: Fot the most part, this movie feels like a \"made-for-TV\" effort. The direction is ham-fisted, the acting (with the exception of Fred Gwynne) is overwrought and soapy. Denise Crosby, particularly, delivers her lines like she's cold reading them off a cue card. Only one thing makes this film worth watching, and that is once Gage comes back from the \"Semetary.\" There is something disturbing about watching a small child murder someone, and this movie might be more than some can handle just for that reason. It is absolutely bone-chilling. This film only does one thing right, but it knocks that one thing right out of the park. Worth seeing just for the last 10 minutes or so.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_indextextlabel_intlabel
026813Fot the most part, this movie feels like a \"ma...0Negative
126581Are you kidding me? The music was SO LOUD in t...0Negative
\n", + "
" + ], + "text/plain": [ + " movie_index text label_int \\\n", + "0 26813 Fot the most part, this movie feels like a \"ma... 0 \n", + "1 26581 Are you kidding me? The music was SO LOUD in t... 0 \n", + "\n", + " label \n", + "0 Negative \n", + "1 Negative " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Read locally stored data.\n", + "filepath = \"data/movie_data.csv\"\n", + "\n", + "df = pd.read_csv(f\"{filepath}\")\n", + "\n", + "# Drop duplicates\n", + "df.drop_duplicates(keep='first', inplace=True)\n", + "\n", + "# Change label column names.\n", + "df.columns = ['text', 'label_int']\n", + "\n", + "# Map numbers to text 'Postive' and 'Negative' for sentiment labels.\n", + "df[\"label\"] = df[\"label_int\"].apply(imdb_utilities.sentiment_score_to_name)\n", + "\n", + "# Split data into train/valid/test.\n", + "df, df_train, df_val, df_test = imdb_utilities.partition_dataset(df, smoke_test=False)\n", + "print(f\"original df shape: {df.shape}\")\n", + "print(f\"df_train shape: {df_train.shape}, df_val shape: {df_val.shape}, df_test shape: {df_test.shape}\")\n", + "assert df_train.shape[0] + df_val.shape[0] + df_test.shape[0] == df.shape[0]\n", + "\n", + "# Inspect data.\n", + "print(f\"Example text length: {len(df.text[0])}\")\n", + "print(f\"Example text: {df.text[0]}\")\n", + "display(df.head(2))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "654dd135", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Count samples positive: 17606\n", + "Count samples negative: 17394\n" + ] + } + ], + "source": [ + "# Check if approx. equal number training examples for each class.\n", + "class1 = df_train.loc[(df_train.label == \"Positive\"), :].copy()\n", + "class2 = df_train.loc[(df_train.label == \"Negative\"), :].copy()\n", + "print(f\"Count samples positive: {class1.shape[0]}\")\n", + "print(f\"Count samples negative: {class2.shape[0]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "c60423a5", + "metadata": {}, + "source": [ + "## Chunking\n", + "\n", + "Before embedding, it is necessary to decide your chunk strategy, chunk size, and chunk overlap. In this demo, I will use:\n", + "- **Strategy** = Keep movie reveiws as single chunks unless they are too long.\n", + "- **Chunk size** = The embedding model's parameter `MAX_SEQ_LENGTH`\n", + "- **Overlap** = Rule-of-thumb 10-15%\n", + "- **Function** = Langchain's convenient `RecursiveCharacterTextSplitter` to split up long reviews recursively.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a53595fa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "chunk size: 511\n" + ] + } + ], + "source": [ + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "\n", + "# Default chunk_size and overlap are calculated from embedding model parameters.\n", + "chunk_size = MAX_SEQ_LENGTH - HF_EOS_TOKEN_LENGTH\n", + "print(f\"chunk size: {chunk_size}\")\n", + "\n", + "def chunk_text(text, chunk_size):\n", + "\n", + " # Default chunk overlap is 10% chunk_size.\n", + " chunk_overlap = np.round(chunk_size * 0.10, 0)\n", + "\n", + " # Use langchain's convenient recursive chunking method.\n", + " text_splitter = RecursiveCharacterTextSplitter(\n", + " chunk_size=chunk_size,\n", + " chunk_overlap=chunk_overlap,\n", + " length_function=len,\n", + " )\n", + " \n", + " chunks = text_splitter.split_text(text)\n", + " return [chunk for chunk in chunks if chunk]" + ] + }, + { + "cell_type": "markdown", + "id": "249e9c74", + "metadata": {}, + "source": [ + "**Demo batch size = 100 rows for demonstration purposes.**\n", + "\n", + "This means the question results could be better with more data." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0f915aed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "original shape: (100, 4)\n", + "chunk_size: 511\n", + "new shape: (290, 5)\n", + "Chunking + embedding time for 100 docs: 9.208609819412231 sec\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_indextextchunkvectorlabel_intlabel
026813Fot the most part, this movie feels like a \"ma...Fot the most part, this movie feels like a \"ma...[-0.022307869, -0.038372956, -0.005369567, -0....0Negative
126813Fot the most part, this movie feels like a \"ma...more than some can handle just for that reason...[0.02946616, -0.024044147, -0.011064137, -0.03...0Negative
226581Are you kidding me? The music was SO LOUD in t...Are you kidding me? The music was SO LOUD in t...[-0.016822321, -0.030674767, -0.041740056, 0.0...0Negative
326581Are you kidding me? The music was SO LOUD in t...And what does a Kansas teen know about shoppin...[0.035922922, -0.06197654, 0.008055181, -0.025...0Negative
440633First of all, I don't understand why some peop...First of all, I don't understand why some peop...[-0.0035528215, -0.042889904, -0.04559665, 0.0...1Positive
\n", + "
" + ], + "text/plain": [ + " movie_index text \\\n", + "0 26813 Fot the most part, this movie feels like a \"ma... \n", + "1 26813 Fot the most part, this movie feels like a \"ma... \n", + "2 26581 Are you kidding me? The music was SO LOUD in t... \n", + "3 26581 Are you kidding me? The music was SO LOUD in t... \n", + "4 40633 First of all, I don't understand why some peop... \n", + "\n", + " chunk \\\n", + "0 Fot the most part, this movie feels like a \"ma... \n", + "1 more than some can handle just for that reason... \n", + "2 Are you kidding me? The music was SO LOUD in t... \n", + "3 And what does a Kansas teen know about shoppin... \n", + "4 First of all, I don't understand why some peop... \n", + "\n", + " vector label_int label \n", + "0 [-0.022307869, -0.038372956, -0.005369567, -0.... 0 Negative \n", + "1 [0.02946616, -0.024044147, -0.011064137, -0.03... 0 Negative \n", + "2 [-0.016822321, -0.030674767, -0.041740056, 0.0... 0 Negative \n", + "3 [0.035922922, -0.06197654, 0.008055181, -0.025... 0 Negative \n", + "4 [-0.0035528215, -0.042889904, -0.04559665, 0.0... 1 Positive " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "type embeddings: of \n", + "of numbers: \n" + ] + } + ], + "source": [ + "# Prepare df for insertion into Milvus index.\n", + "\n", + "# Batch of data from pandas DataFrame.\n", + "BATCH_SIZE = 100\n", + "batch = df.head(BATCH_SIZE).copy()\n", + "print(f\"original shape: {batch.shape}\")\n", + "start_time = time.time()\n", + "\n", + "# 1. Change primary key type to string.\n", + "batch[\"movie_index\"] = batch[\"movie_index\"].apply(lambda x: str(x))\n", + "\n", + "# 2. Truncate reviews to 512 characters.\n", + "# Naive approach, just truncate to 512 characters.\n", + "# batch[\"text\"] = batch[\"text\"].apply(lambda x: x[:MAX_SEQ_LENGTH - HF_EOS_TOKEN_LENGTH])\n", + "\n", + "# Better approach, use LangChain's utility function that adds chunk overlaps.\n", + "\n", + "###############\n", + "## EXERCISE #1: Change NEW_CHUNK_SIZE to 256 below. How many chunks (vectors) does this create?\n", + "## ANSWER: 542\n", + "## BONUS: Can you explain why the # vectors changed from 290 to 542? \n", + "## Hint: What is the default chunk overlap?\n", + "###############\n", + "NEW_CHUNK_SIZE = None \n", + "if NEW_CHUNK_SIZE is not None:\n", + " chunk_size = NEW_CHUNK_SIZE\n", + "print(f\"chunk_size: {chunk_size}\")\n", + "## END EXERCISE #1\n", + "\n", + "batch['chunk'] = batch['text'].apply(chunk_text, chunk_size=chunk_size)\n", + "# Explode the 'chunk' column to create new rows for each chunk.\n", + "batch = batch.explode('chunk', ignore_index=True)\n", + "print(f\"new shape: {batch.shape}\")\n", + "\n", + "# 3. Add embeddings as new column in df.\n", + "review_embeddings = torch.tensor(retriever.encode(batch['chunk']))\n", + "# Normalize embeddings to unit length.\n", + "review_embeddings = F.normalize(review_embeddings, p=2, dim=1)\n", + "# Quick check if embeddings are normalized.\n", + "norms = np.linalg.norm(review_embeddings, axis=1)\n", + "assert np.allclose(norms, 1.0, atol=1e-5) == True\n", + "\n", + "# 4. Convert embeddings to list of `numpy.ndarray`, each containing `numpy.float32` numbers.\n", + "converted_values = list(map(np.float32, review_embeddings))\n", + "batch['vector'] = converted_values\n", + "\n", + "# 5. Reorder columns for conveneince, so index first, labels at end.\n", + "new_order = [\"movie_index\", \"text\", \"chunk\", \"vector\", \"label_int\", \"label\"]\n", + "batch = batch[new_order]\n", + "\n", + "end_time = time.time()\n", + "print(f\"Chunking + embedding time for {BATCH_SIZE} docs: {end_time - start_time} sec\")\n", + "\n", + "# Inspect data.\n", + "display(batch.head())\n", + "assert len(batch.chunk[0]) <= MAX_SEQ_LENGTH-1\n", + "assert len(batch.vector[0]) == EMBEDDING_LENGTH\n", + "print(f\"type embeddings: {type(batch.vector)} of {type(batch.vector[0])}\")\n", + "print(f\"of numbers: {type(batch.vector[0][0])}\")\n", + "\n", + "# Chunking looks good, drop the original text column.\n", + "batch.drop(columns=[\"text\"], inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "d9bd8153", + "metadata": {}, + "source": [ + "## Insert data into Milvus\n", + "\n", + "We can insert a batch of data directly from a pandas dataframe into Milvus.\n", + "\n", + "🤔 TODO: This would be a good place to demonstrate Milvus' scalability by using Ray together with Milvus to run batches in parallel. I'll do this in a future tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b51ff139", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start inserting entities\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:00<00:00, 32.24it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Milvus insert time for 290 vectors: 0.03276681900024414 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# Insert a batch of data into the Milvus collection.\n", + "\n", + "# Convert DataFrame to a list of dictionaries\n", + "dict_list = []\n", + "for _, row in batch.iterrows():\n", + " dictionary = row.to_dict()\n", + " dict_list.append(dictionary)\n", + "\n", + "print(\"Start inserting entities\")\n", + "start_time = time.time()\n", + "insert_result = mc.insert(\n", + " COLLECTION_NAME,\n", + " data=dict_list, \n", + " progress_bar=True)\n", + "end_time = time.time()\n", + "print(f\"Milvus insert time for {batch.shape[0]} vectors: {end_time - start_time} seconds\")\n", + "\n", + "# After final entity is inserted, call flush to stop growing segments left in memory.\n", + "mc.flush(COLLECTION_NAME)\n" + ] + }, + { + "cell_type": "markdown", + "id": "4ebfb115", + "metadata": {}, + "source": [ + "## Run a Semantic Search\n", + "\n", + "Now we can search all the movie review embeddings to find the `TOP_K` movie reviews with the closest embeddings to a user's query.\n", + "- In this example, we'll search for a movie recommendation for a medical doctor.\n", + "\n", + "💡 The same model should always be used for consistency for all the embeddings." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "eb7bc132", + "metadata": {}, + "outputs": [], + "source": [ + "# .load() not needed when using no-schema Milvus client.\n", + "\n", + "# # Before conducting a search based on a query, you need to load the data into memory.\n", + "# mc.load()\n", + "# print(\"Loaded milvus collection into memory.\")" + ] + }, + { + "cell_type": "markdown", + "id": "02c589ff", + "metadata": {}, + "source": [ + "## Ask a question about your data\n", + "\n", + "So far in this demo notebook: \n", + "1. Your custom data has been mapped into a vector embedding space\n", + "2. Those vector embeddings have been saved into a vector database\n", + "\n", + "Next, you can ask a question about your custom data!\n", + "\n", + "💡 In LLM lingo:\n", + "> **Query** is the generic term for user questions. \n", + "A query is a list of multiple individual questions, up to maybe 1000 different questions!\n", + "\n", + "> **Question** usually refers to a single user question. \n", + "In our example below, the user question is \"I'm a medical doctor, what movie should I watch?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "5e7f41f4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query length: 48\n" + ] + } + ], + "source": [ + "# Define a sample question about your data.\n", + "question = \"I'm a medical doctor, what movie should I watch?\"\n", + "query = [question]\n", + "\n", + "# Inspect the length of the query.\n", + "QUERY_LENGTH = len(query[0])\n", + "print(f\"query length: {QUERY_LENGTH}\")" + ] + }, + { + "cell_type": "markdown", + "id": "fa545611", + "metadata": {}, + "source": [ + "**Embed the question using the same embedding model you used earlier**\n", + "\n", + "In order for vector search to work, the question itself should be embedded with the same model used to create the colleciton you want to search." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "a6863a32", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1 \n", + "\n" + ] + } + ], + "source": [ + "# Embed the query using same embedding model used to create the Milvus collection.\n", + "query_embeddings = torch.tensor(retriever.encode(query))\n", + "# Normalize embeddings to unit length.\n", + "query_embeddings = F.normalize(query_embeddings, p=2, dim=1)\n", + "# Quick check if embeddings are normalized.\n", + "norms = np.linalg.norm(query_embeddings, axis=1)\n", + "assert np.allclose(norms, 1.0, atol=1e-5) == True\n", + "\n", + "# Convert the embeddings to list of list of np.float32.\n", + "query_embeddings = list(map(np.float32, query_embeddings))\n", + "\n", + "# Inspect data.\n", + "print(type(query_embeddings), len(query_embeddings), type(query_embeddings[0]))\n", + "print(type(query_embeddings[0][0]) ) " + ] + }, + { + "cell_type": "markdown", + "id": "9ea29411", + "metadata": {}, + "source": [ + "## Execute a vector search\n", + "\n", + "Search Milvus using [PyMilvus API](https://milvus.io/docs/search.md).\n", + "\n", + "💡 By their nature, vector searches are \"semantic\" searches. For example, if you were to search for \"leaky faucet\": \n", + "> **Traditional Key-word Search** - either or both words \"leaky\", \"faucet\" would have to match some text in order to return a web page or link text to the document.\n", + "\n", + "> **Semantic search** - results containing words \"drippy\" \"taps\" would be returned as well because these words mean the same thing even though they are different words," + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c5d98e28", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Search time: 0.005642890930175781 sec\n", + "type: , count: 3\n" + ] + } + ], + "source": [ + "# Execute a search.\n", + "\n", + "# Return top k results with HNSW index.\n", + "TOP_K = 3\n", + "SEARCH_PARAMS = dict({\n", + " # Re-use index param for num_candidate_nearest_neighbors.\n", + " \"ef\": INDEX_PARAMS['efConstruction']\n", + " })\n", + "\n", + "# Run semantic vector search using your query and the vector database.\n", + "start_time = time.time()\n", + "results = mc.search(\n", + " COLLECTION_NAME,\n", + " data=query_embeddings, \n", + " search_params=SEARCH_PARAMS,\n", + " output_fields=[\"movie_index\", \"chunk\", \"label\"], \n", + " limit=TOP_K,\n", + " consistency_level=\"Eventually\",\n", + " )\n", + "\n", + "elapsed_time = time.time() - start_time\n", + "print(f\"Search time: {elapsed_time} sec\")\n", + "\n", + "# Inspect search result.\n", + "print(f\"type: {type(results)}, count: {len(results[0])}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "95f7e011", + "metadata": {}, + "source": [ + "## Assemble and inspect the search result\n", + "\n", + "The search result is in the variable `result[0]` of type `'pymilvus.orm.search.SearchResult'`. " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d3dfa33a", + "metadata": {}, + "outputs": [], + "source": [ + "## Results returned from MilvusClient are in the form list of lists of dicts.\n", + "\n", + "# Get the movie_indexes, review texts, and labels.\n", + "distances = []\n", + "texts = []\n", + "movie_indexes = []\n", + "labels = []\n", + "for result in results[0]:\n", + " distances.append(result['distance'])\n", + " texts.append(result['entity']['chunk'])\n", + " movie_indexes.append(result['entity']['movie_index'])\n", + " labels.append(result['entity']['label'])\n", + "\n", + "# Assemble all the results in a zipped list.\n", + "formatted_results = list(zip(distances, movie_indexes, texts, labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "22d65363", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0: 0.541, 931, Negative, Dr. K(David H Hickey)has been trying to master a formula that would end all disease and handicaps, b\n", + "1: 0.54, 20682, Positive, is not a horror movie, although it does contain some violent scenes, but is rather a comedy. A satir\n", + "2: 0.535, 12529, Positive, a good movie with a real good story. The fact that there are so many other big stars who all also ha\n" + ] + } + ], + "source": [ + "# Print the results.\n", + "# k: distance, movie_index, label, review text\n", + "\n", + "i = 0\n", + "for row in formatted_results:\n", + " print(f\"{i}: {np.round(row[0],3)}, {row[1]}, {row[3]}, {row[2][:100]}\")\n", + " i += 1\n", + "\n", + "#1: 2006, Serum, \n", + "# 0: 0.541, 931, Negative, Dr. K(David H Hickey)has been trying to master a formula that would end all disease and handicaps, b\n", + "# 1: 0.54, 20682, Positive, is not a horror movie, although it does contain some violent scenes, but is rather a comedy. A satir\n", + "# 2: 0.535, 12529, Positive, a good movie with a real good story. The fact that there are so many other big stars who\n" + ] + }, + { + "cell_type": "markdown", + "id": "9cf49a96", + "metadata": {}, + "source": [ + "## Try another question\n", + "\n", + "This time just add the words **only good movies** to the question, see if the answers are any different? \n", + "\n", + "For semantically different questions, we expect the answers to be different.\n", + "\n", + "To make the code easier to read, this time I'll just use the convenience function I defined at the top of this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "922073f2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0: 0.562, 45719, Positive, the stories but helps Malkovich to provoke some thought.

I'd say it is worth seeing and t\n", + "1: 0.562, 21791, Positive, to add that the dog (who's a pretty darn good actor himself!) comes in a close second.

Al\n", + "2: 0.561, 12529, Positive, a good movie with a real good story. The fact that there are so many other big stars who all also ha\n" + ] + } + ], + "source": [ + "# Take as input a user question and conduct semantic vector search using the question.\n", + "question = \"I'm a medical doctor, what movie should I watch?\"\n", + "new_question = \"I'm a medical doctor, suggest only good movies to watch?\"\n", + "new_results = \\\n", + " imdb_utilities.mc_search_imdb([new_question],\n", + " retriever,\n", + " mc,\n", + " SEARCH_PARAMS, 3, \n", + " milvus_client=True,\n", + " COLLECTION_NAME=COLLECTION_NAME,\n", + " )\n", + "\n", + "# Print the results.\n", + "# k: distance, movie_index, label, review text\n", + "i = 0\n", + "for row in new_results:\n", + " print(f\"{i}: {np.round(row[0],3)}, {row[1]}, {row[3]}, {row[2][:100]}\")\n", + " i += 1\n", + "\n", + "# As expected, new_question answers are slightly different!\n", + "# 0: 0.562, 45719, Positive, the stories but helps Malkovich to provoke some thought.

I'd say it is worth seeing and t\n", + "# 1: 0.562, 21791, Positive, to add that the dog (who's a pretty darn good actor himself!) comes in a close second.

Al\n", + "# 2: 0.561, 12529, Positive, a good movie with a real good story. The fact that there are so many other big" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d0e81e68", + "metadata": {}, + "outputs": [], + "source": [ + "# Shut down and cleanup the milvus server.\n", + "default_server.stop()\n", + "default_server.cleanup()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "c777937e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Author: Christy Bergman\n", + "\n", + "Python implementation: CPython\n", + "Python version : 3.10.12\n", + "IPython version : 8.15.0\n", + "\n", + "torch : 2.0.1\n", + "transformers: 4.33.2\n", + "milvus : 2.3.0\n", + "pymilvus : 2.3.0\n", + "langchain : 0.0.301\n", + "\n", + "conda environment: py310\n", + "\n" + ] + } + ], + "source": [ + "# Props to Sebastian Raschka for this handy watermark.\n", + "# !pip install watermark\n", + "\n", + "%load_ext watermark\n", + "%watermark -a 'Christy Bergman' -v -p torch,transformers,milvus,pymilvus,langchain --conda" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/text/imdb_utilities.py b/notebooks/text/imdb_utilities.py new file mode 100644 index 000000000..d200e38ba --- /dev/null +++ b/notebooks/text/imdb_utilities.py @@ -0,0 +1,98 @@ +import numpy as np +import torch +from torch.nn import functional as F + +# Output words instead of scores. +def sentiment_score_to_name(score: float): + if score > 0: + return "Positive" + elif score <= 0: + return "Negative" + +# Split data into train, valid, test. +def partition_dataset(df_input, smoke_test=False): + """Splits data, assuming original, input dataframe contains 50K rows. + + Args: + df_input (pandas.DataFrame): input data frame + smoke_test (boolean): if True, use smaller number of rows for testing + + Returns: + df_train, df_val, df_test (pandas.DataFrame): train, valid, test splits. + """ + + # Shuffle data and split into train/val/test. + df_shuffled = df_input.sample(frac=1, random_state=1).reset_index() + # Add a corpus index. + columns = ['movie_index', 'text', 'label_int', 'label'] + df_shuffled.columns = columns + + df_train = df_shuffled.iloc[:35_000] + df_val = df_shuffled.iloc[35_000:40_000] + df_test = df_shuffled.iloc[40_000:] + + # Save train/val/test split data locally in separate files. + df_train.to_csv("train.csv", index=False, encoding="utf-8") + df_val.to_csv("val.csv", index=False, encoding="utf-8") + df_test.to_csv("test.csv", index=False, encoding="utf-8") + + return df_shuffled, df_train, df_val, df_test + +# Take as input a user query and conduct semantic vector search using the query. +def mc_search_imdb(query, retriever, milvus_collection, search_params, top_k, + milvus_client=False, COLLECTION_NAME = 'movies'): + + # Embed the query using same embedding model used to create the Milvus collection. + query_embeddings = torch.tensor(retriever.encode(query)) + # Normalize embeddings to unit length. + query_embeddings = F.normalize(query_embeddings, p=2, dim=1) + # Quick check if embeddings are normalized. + norms = np.linalg.norm(query_embeddings, axis=1) + assert np.allclose(norms, 1.0, atol=1e-5) == True + # Convert the embeddings to list of list of np.float32. + query_embeddings = list(map(np.float32, query_embeddings)) + + # Run semantic vector search using your query and the vector database. + # Assemble results. + distances = [] + texts = [] + movie_indexes = [] + labels = [] + if milvus_client: + # MilvusClient search API call slightly different. + results = milvus_collection.search( + COLLECTION_NAME, + data=query_embeddings, + search_params=search_params, + output_fields=["movie_index", "chunk", "label"], + limit=top_k, + consistency_level="Eventually", + ) + # Results returned from MilvusClient are in the form list of lists of dicts. + for result in results[0]: + distances.append(result['distance']) + texts.append(result['entity']['chunk']) + movie_indexes.append(result['entity']['movie_index']) + labels.append(result['entity']['label']) + else: + # Milvus server search API call. + results = milvus_collection.search( + data=query_embeddings, + anns_field="vector", + param=search_params, + output_fields=["movie_index", "chunk", "label"], + limit=top_k, + consistency_level="Eventually" + ) + # Assemble results from Milvus server. + distances = results[0].distances + for result in results[0]: + texts.append(result.entity.get("chunk")) + movie_indexes.append(result.entity.get("movie_index")) + labels.append(result.entity.get("label")) + + # Assemble all the results in a zipped list. + formatted_results = list(zip(distances, movie_indexes, texts, labels)) + + return formatted_results +