diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 0f5a449075..54f1ac4031 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -125,10 +125,11 @@ nav: - DuckDB: python/duckdb.md - LangChain: - LangChain 🔗: integrations/langchain.md + - LangChain demo: notebooks/langchain_example.ipynb - LangChain JS/TS 🔗: https://js.langchain.com/docs/integrations/vectorstores/lancedb - LlamaIndex 🦙: - LlamaIndex docs: integrations/llamaIndex.md - - LlamaIndex demo: https://docs.llamaindex.ai/en/stable/examples/vector_stores/LanceDBIndexDemo/ + - LlamaIndex demo: notebooks/LlamaIndex_example.ipynb - Pydantic: python/pydantic.md - Voxel51: integrations/voxel51.md - PromptTools: integrations/prompttools.md @@ -204,9 +205,9 @@ nav: - Pandas and PyArrow: python/pandas_and_pyarrow.md - Polars: python/polars_arrow.md - DuckDB: python/duckdb.md - - LangChain 🦜️🔗↗: https://python.langchain.com/docs/integrations/vectorstores/lancedb + - LangChain 🦜️🔗↗: integrations/langchain.md - LangChain.js 🦜️🔗↗: https://js.langchain.com/docs/integrations/vectorstores/lancedb - - LlamaIndex 🦙↗: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html + - LlamaIndex 🦙↗: integrations/llamaIndex.md - Pydantic: python/pydantic.md - Voxel51: integrations/voxel51.md - PromptTools: integrations/prompttools.md diff --git a/docs/src/integrations/langchain.md b/docs/src/integrations/langchain.md index fd754cda0f..ec3dbe22d3 100644 --- a/docs/src/integrations/langchain.md +++ b/docs/src/integrations/langchain.md @@ -2,7 +2,7 @@ ![Illustration](../assets/langchain.png) ## Quick Start -You can load your document data using langchain's loaders, for this example we are using `TextLoader` and `OpenAIEmbeddings` as the embedding model. +You can load your document data using langchain's loaders, for this example we are using `TextLoader` and `OpenAIEmbeddings` as the embedding model. 
Check out the complete example here - [LangChain demo](../notebooks/langchain_example.ipynb) ```python import os from langchain.document_loaders import TextLoader @@ -38,6 +38,8 @@ The exhaustive list of parameters for `LanceDB` vector store are : - `api_key`: (Optional) API key to use for LanceDB cloud database. Defaults to `None`. - `region`: (Optional) Region to use for LanceDB cloud database. Only for LanceDB Cloud, defaults to `None`. - `mode`: (Optional) Mode to use for adding data to the table. Defaults to `'overwrite'`. +- `reranker`: (Optional) The reranker to use for LanceDB. +- `relevance_score_fn`: (Optional[Callable[[float], float]]) Langchain relevance score function to be used. Defaults to `None`. ```python db_url = "db://lang_test" # url of db you created @@ -54,12 +56,14 @@ vector_store = LanceDB( ``` ### Methods -To add texts and store respective embeddings automatically: + ##### add_texts() - `texts`: `Iterable` of strings to add to the vectorstore. - `metadatas`: Optional `list[dict()]` of metadatas associated with the texts. - `ids`: Optional `list` of ids to associate with the texts. +- `kwargs`: `Any` +This method adds texts and stores respective embeddings automatically. ```python vector_store.add_texts(texts = ['test_123'], metadatas =[{'source' :'wiki'}]) @@ -74,7 +78,6 @@ pd_df.to_csv("docsearch.csv", index=False) # you can also create a new vector store object using an older connection object: vector_store = LanceDB(connection=tbl, embedding=embeddings) ``` -For index creation make sure your table has enough data in it. An ANN index is ususally not needed for datasets ~100K vectors. For large-scale (>1M) or higher dimension vectors, it is beneficial to create an ANN index. ##### create_index() - `col_name`: `Optional[str] = None` - `vector_col`: `Optional[str] = None` @@ -82,6 +85,8 @@ For index creation make sure your table has enough data in it. 
An ANN index is u - `num_sub_vectors`: `Optional[int] = 96` - `index_cache_size`: `Optional[int] = None` +This method creates an index for the vector store. For index creation make sure your table has enough data in it. An ANN index is usually not needed for datasets of ~100K vectors. For large-scale (>1M) or higher dimension vectors, it is beneficial to create an ANN index. + ```python # for creating vector index vector_store.create_index(vector_col='vector', metric = 'cosine') @@ -89,4 +94,108 @@ vector_store.create_index(vector_col='vector', metric = 'cosine') # for creating scalar index(for non-vector columns) vector_store.create_index(col_name='text') -``` \ No newline at end of file +``` + +##### similarity_search() +- `query`: `str` +- `k`: `Optional[int] = None` +- `filter`: `Optional[Dict[str, str]] = None` +- `fts`: `Optional[bool] = False` +- `name`: `Optional[str] = None` +- `kwargs`: `Any` + +Returns documents most similar to the query, without relevance scores. + +```python +docs = docsearch.similarity_search(query) +print(docs[0].page_content) +``` + +##### similarity_search_by_vector() +- `embedding`: `List[float]` +- `k`: `Optional[int] = None` +- `filter`: `Optional[Dict[str, str]] = None` +- `name`: `Optional[str] = None` +- `kwargs`: `Any` + +Returns documents most similar to the query vector. + +```python +docs = docsearch.similarity_search_by_vector(query) +print(docs[0].page_content) +``` + +##### similarity_search_with_score() +- `query`: `str` +- `k`: `Optional[int] = None` +- `filter`: `Optional[Dict[str, str]] = None` +- `kwargs`: `Any` + +Returns documents most similar to the query string, along with relevance scores. It is called by the base class's `similarity_search_with_relevance_scores`, which selects the relevance score based on our `_select_relevance_score_fn`. 
+ +```python +docs = docsearch.similarity_search_with_relevance_scores(query) +print("relevance score - ", docs[0][1]) +print("text- ", docs[0][0].page_content[:1000]) +``` + +##### similarity_search_by_vector_with_relevance_scores() +- `embedding`: `List[float]` +- `k`: `Optional[int] = None` +- `filter`: `Optional[Dict[str, str]] = None` +- `name`: `Optional[str] = None` +- `kwargs`: `Any` + +Returns documents most similar to the query vector with relevance scores. +Relevance score + +```python +docs = docsearch.similarity_search_by_vector_with_relevance_scores(query_embedding) +print("relevance score - ", docs[0][1]) +print("text- ", docs[0][0].page_content[:1000]) +``` + +##### max_marginal_relevance_search() +- `query`: `str` +- `k`: `Optional[int] = None` +- `fetch_k` : Number of Documents to fetch to pass to MMR algorithm, `Optional[int] = None` +- `lambda_mult`: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. `float = 0.5` +- `filter`: `Optional[Dict[str, str]] = None` +- `kwargs`: `Any` + +Returns docs selected using the maximal marginal relevance (MMR). +Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents. + +Similarly, `max_marginal_relevance_search_by_vector()` function returns docs most similar to the embedding passed to the function using MMR. Instead of a string query, you need to pass the embedding to be searched for. + +```python +result = docsearch.max_marginal_relevance_search( + query="text" + ) +result_texts = [doc.page_content for doc in result] +print(result_texts) + +## search by vector : +result = docsearch.max_marginal_relevance_search_by_vector( + embeddings.embed_query("text") + ) +result_texts = [doc.page_content for doc in result] +print(result_texts) +``` + +##### add_images() +- `uris` : File path to the image. `List[str]`. 
+- `metadatas` : Optional list of metadatas. `(Optional[List[dict]], optional)` +- `ids` : Optional list of IDs. `(Optional[List[str]], optional)` + +Adds images by automatically creating their embeddings and adds them to the vectorstore. + +```python +vec_store.add_images(uris=image_uris) +# here image_uris are local fs paths to the images. +``` + + diff --git a/docs/src/integrations/llamaIndex.md b/docs/src/integrations/llamaIndex.md index 7647e5dc09..210388ac44 100644 --- a/docs/src/integrations/llamaIndex.md +++ b/docs/src/integrations/llamaIndex.md @@ -2,7 +2,8 @@ ![Illustration](../assets/llama-index.jpg) ## Quick start -You would need to install the integration via `pip install llama-index-vector-stores-lancedb` in order to use it. You can run the below script to try it out : +You would need to install the integration via `pip install llama-index-vector-stores-lancedb` in order to use it. +You can run the script below to try it out: ```python import logging import sys @@ -43,6 +44,8 @@ retriever = index.as_retriever(vector_store_kwargs={"where": lance_filter}) response = retriever.retrieve("What did the author do growing up?") ``` +Check out the complete example here - [LlamaIndex demo](../notebooks/LlamaIndex_example.ipynb) + ### Filtering For metadata filtering, you can use a Lance SQL-like string filter as demonstrated in the example above. 
Additionally, you can also filter using the `MetadataFilters` class from LlamaIndex: ```python diff --git a/docs/src/notebooks/LlamaIndex_example.ipynb b/docs/src/notebooks/LlamaIndex_example.ipynb new file mode 100644 index 0000000000..887b585e93 --- /dev/null +++ b/docs/src/notebooks/LlamaIndex_example.ipynb @@ -0,0 +1,538 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "2db56c9b", + "metadata": {}, + "source": [ + "\"Open" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "db0855d0", + "metadata": {}, + "source": [ + "# LanceDB Vector Store\n", + "In this notebook we are going to show how to use [LanceDB](https://www.lancedb.com) to perform vector searches in LlamaIndex" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f44170b2", + "metadata": {}, + "source": [ + "If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c84199c", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install llama-index llama-index-vector-stores-lancedb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a90ce34", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install lancedb==0.6.13 #Only required if the above cell installs an older version of lancedb (pypi package may not be released yet)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39c62671", + "metadata": {}, + "outputs": [], + "source": [ + "# Refresh vector store URI if restarting or re-using the same notebook\n", + "! 
rm -rf ./lancedb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59b54276", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "# Uncomment to see debug logs\n", + "# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)\n", + "# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", + "\n", + "\n", + "from llama_index.core import SimpleDirectoryReader, Document, StorageContext\n", + "from llama_index.core import VectorStoreIndex\n", + "from llama_index.vector_stores.lancedb import LanceDBVectorStore\n", + "import textwrap" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "26c71b6d", + "metadata": {}, + "source": [ + "### Setup OpenAI\n", + "The first step is to configure the OpenAI API key. It will be used to create embeddings for the documents loaded into the index." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67b86621", + "metadata": {}, + "outputs": [], + "source": [ + "import openai\n", + "\n", + "openai.api_key = \"sk-\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "073f0a68", + "metadata": {}, + "source": [ + "Download Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eef1b911", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-06-11 16:42:37-- https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 75042 (73K) [text/plain]\n", + "Saving to: ‘data/paul_graham/paul_graham_essay.txt’\n", + "\n", + "data/paul_graham/pa 100%[===================>] 73.28K --.-KB/s in 0.02s \n", + "\n", + "2024-06-11 16:42:37 (3.97 MB/s) - ‘data/paul_graham/paul_graham_essay.txt’ saved [75042/75042]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir -p 'data/paul_graham/'\n", + "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f7010b1d-d1bb-4f08-9309-a328bb4ea396", + "metadata": {}, + "source": [ + "### Loading documents\n", + "Load the documents stored in the `data/paul_graham/` using the SimpleDirectoryReader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c154dd4b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document ID: cac1ba78-5007-4cf8-89ba-280264790115 Document Hash: fe2d4d3ef3a860780f6c2599808caa587c8be6516fe0ba4ca53cf117044ba953\n" + ] + } + ], + "source": [ + "documents = SimpleDirectoryReader(\"./data/paul_graham/\").load_data()\n", + "print(\"Document ID:\", documents[0].doc_id, \"Document Hash:\", documents[0].hash)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c0232fd1", + "metadata": {}, + "source": [ + "### Create the index\n", + "Here we create an index backed by LanceDB using the documents loaded previously. LanceDBVectorStore takes a few arguments.\n", + "- uri (str, required): Location where LanceDB will store its files.\n", + "- table_name (str, optional): The table name where the embeddings will be stored. Defaults to \"vectors\".\n", + "- nprobes (int, optional): The number of probes used. A higher number makes search more accurate but also slower. 
Defaults to 20.\n", + "- refine_factor (int, optional): Refine the results by reading extra elements and re-ranking them in memory. Defaults to None\n", + "\n", + "- More details can be found at [LanceDB docs](https://lancedb.github.io/lancedb/ann_indexes)" + ] + }, + { + "cell_type": "markdown", + "id": "1f2e20ef", + "metadata": {}, + "source": [ + "##### For LanceDB cloud :\n", + "```python\n", + "vector_store = LanceDBVectorStore( \n", + " uri=\"db://db_name\", # your remote DB URI\n", + " api_key=\"sk_..\", # lancedb cloud api key\n", + " region=\"your-region\" # the region you configured\n", + " ...\n", + ")\n```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8731da62", + "metadata": {}, + "outputs": [], + "source": [ + "vector_store = LanceDBVectorStore(\n", + " uri=\"./lancedb\", mode=\"overwrite\", query_type=\"hybrid\"\n", + ")\n", + "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", + "\n", + "index = VectorStoreIndex.from_documents(\n", + " documents, storage_context=storage_context\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8ee4473a-094f-4d0a-a825-e1213db07240", + "metadata": {}, + "source": [ + "### Query the index\n", + "We can now ask questions using our index. We can use filtering via `MetadataFilters` or use the native Lance `where` clause." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5eb6419b", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core.vector_stores import (\n", + " MetadataFilters,\n", + " FilterOperator,\n", + " FilterCondition,\n", + " MetadataFilter,\n", + ")\n", + "\n", + "from datetime import datetime\n", + "\n", + "\n", + "query_filters = MetadataFilters(\n", + " filters=[\n", + " MetadataFilter(\n", + " key=\"creation_date\",\n", + " operator=FilterOperator.EQ,\n", + " value=datetime.now().strftime(\"%Y-%m-%d\"),\n", + " ),\n", + " MetadataFilter(\n", + " key=\"file_size\", value=75040, operator=FilterOperator.GT\n", + " ),\n", + " ],\n", + " condition=FilterCondition.AND,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ee201930", + "metadata": {}, + "source": [ + "### Hybrid Search\n", + "\n", + "LanceDB offers hybrid search with reranking capabilities. For complete documentation, refer [here](https://lancedb.github.io/lancedb/hybrid_search/hybrid_search/).\n", + "\n", + "This example uses the `colbert` reranker. The following cell installs the necessary dependencies for `colbert`. If you choose a different reranker, make sure to adjust the dependencies accordingly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e12d1454", + "metadata": {}, + "outputs": [], + "source": [ + "! 
pip install -U torch transformers tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985" + ] + }, + { + "cell_type": "markdown", + "id": "c742cb07", + "metadata": {}, + "source": [ + "if you want to add a reranker at vector store initialization, you can pass it in the arguments like below :\n", + "```\n", + "from lancedb.rerankers import ColbertReranker\n", + "reranker = ColbertReranker()\n", + "vector_store = LanceDBVectorStore(uri=\"./lancedb\", reranker=reranker, mode=\"overwrite\")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27ea047b", + "metadata": {}, + "outputs": [], + "source": [ + "import lancedb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8414517f", + "metadata": {}, + "outputs": [], + "source": [ + "from lancedb.rerankers import ColbertReranker\n", + "\n", + "reranker = ColbertReranker()\n", + "vector_store._add_reranker(reranker)\n", + "\n", + "query_engine = index.as_query_engine(\n", + " filters=query_filters,\n", + " # vector_store_kwargs={\n", + " # \"query_type\": \"fts\",\n", + " # },\n", + ")\n", + "\n", + "response = query_engine.query(\"How much did Viaweb charge per month?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc6ccb7a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Viaweb charged $100 a month for a small store and $300 a month for a big one.\n", + "metadata - {'65ed5f07-5b8a-4143-a939-e8764884828e': {'file_path': '/Users/raghavdixit/Desktop/open_source/llama_index_lance/docs/docs/examples/vector_stores/data/paul_graham/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-06-11', 'last_modified_date': '2024-06-11'}, 'be231827-20b8-4988-ac75-94fa79b3c22e': {'file_path': 
'/Users/raghavdixit/Desktop/open_source/llama_index_lance/docs/docs/examples/vector_stores/data/paul_graham/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-06-11', 'last_modified_date': '2024-06-11'}}\n" + ] + } + ], + "source": [ + "print(response)\n", + "print(\"metadata -\", response.metadata)" + ] + }, + { + "cell_type": "markdown", + "id": "0c1c6c73", + "metadata": {}, + "source": [ + "##### lance filters(SQL like) directly via the `where` clause :" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a2bcc07", + "metadata": {}, + "outputs": [], + "source": [ + "lance_filter = \"metadata.file_name = 'paul_graham_essay.txt' \"\n", + "retriever = index.as_retriever(vector_store_kwargs={\"where\": lance_filter})\n", + "response = retriever.retrieve(\"What did the author do growing up?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ac47cf9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "What I Worked On\n", + "\n", + "February 2021\n", + "\n", + "Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n", + "\n", + "The first programs I tried writing were on the IBM 1401 that our school district used for what was then called \"data processing.\" This was in 9th grade, so I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. 
It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n", + "\n", + "The language we used was an early version of Fortran. You had to type programs on punch cards, then stack them in the card reader and press a button to load the program into memory and run it. The result would ordinarily be to print something on the spectacularly loud printer.\n", + "\n", + "I was puzzled by the 1401. I couldn't figure out what to do with it. And in retrospect there's not much I could have done with it. The only form of input to programs was data stored on punched cards, and I didn't have any data stored on punched cards. The only other option was to do things that didn't rely on any input, like calculate approximations of pi, but I didn't know enough math to do anything interesting of that type. So I'm not surprised I can't remember any programs I wrote, because they can't have done much. My clearest memory is of the moment I learned it was possible for programs not to terminate, when one of mine didn't. On a machine without time-sharing, this was a social as well as a technical error, as the data center manager's expression made clear.\n", + "\n", + "With microcomputers, everything changed. Now you could have a computer sitting right in front of you, on a desk, that could respond to your keystrokes as it was running instead of just churning through a stack of punch cards and then stopping. [1]\n", + "\n", + "The first of my friends to get a microcomputer built it himself. It was sold as a kit by Heathkit. I remember vividly how impressed and envious I felt watching him sitting in front of it, typing programs right into the computer.\n", + "\n", + "Computers were expensive in those days and it took me years of nagging before I convinced my father to buy one, a TRS-80, in about 1980. 
The gold standard then was the Apple II, but a TRS-80 was good enough. This was when I really started programming. I wrote simple games, a program to predict how high my model rockets would fly, and a word processor that my father used to write at least one book. There was only room in memory for about 2 pages of text, so he'd write 2 pages at a time and then print them out, but it was a lot better than a typewriter.\n", + "\n", + "Though I liked programming, I didn't plan to study it in college. In college I was going to study philosophy, which sounded much more powerful. It seemed, to my naive high school self, to be the study of the ultimate truths, compared to which the things studied in other fields would be mere domain knowledge. What I discovered when I got to college was that the other fields took up so much of the space of ideas that there wasn't much left for these supposed ultimate truths. All that seemed left for philosophy were edge cases that people in other fields felt could safely be ignored.\n", + "\n", + "I couldn't have put this into words when I was 18. All I knew at the time was that I kept taking philosophy courses and they kept being boring. So I decided to switch to AI.\n", + "\n", + "AI was in the air in the mid 1980s, but there were two things especially that made me want to work on it: a novel by Heinlein called The Moon is a Harsh Mistress, which featured an intelligent computer called Mike, and a PBS documentary that showed Terry Winograd using SHRDLU. 
I haven't tried rereading The Moon is a Harsh Mistress, so I don't know how well it has aged, but when I read it I was drawn entirely into its world.\n", + "metadata - {'file_path': '/Users/raghavdixit/Desktop/open_source/llama_index_lance/docs/docs/examples/vector_stores/data/paul_graham/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-06-11', 'last_modified_date': '2024-06-11'}\n" + ] + } + ], + "source": [ + "print(response[0].get_content())\n", + "print(\"metadata -\", response[0].metadata)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6afc84ac", + "metadata": {}, + "source": [ + "### Appending data\n", + "You can also add data to an existing index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "759a532e", + "metadata": {}, + "outputs": [], + "source": [ + "nodes = [node.node for node in response]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "069fc099", + "metadata": {}, + "outputs": [], + "source": [ + "del index\n", + "\n", + "index = VectorStoreIndex.from_documents(\n", + " [Document(text=\"The sky is purple in Portland, Maine\")],\n", + " uri=\"/tmp/new_dataset\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a64ed441", + "metadata": {}, + "outputs": [], + "source": [ + "index.insert_nodes(nodes)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5cffcfe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Portland, Maine\n" + ] + } + ], + "source": [ + "query_engine = index.as_query_engine()\n", + "response = query_engine.query(\"Where is the sky purple?\")\n", + "print(textwrap.fill(str(response), 100))" + ] + }, + { + "cell_type": "markdown", + "id": "ec548a02", + "metadata": {}, + "source": [ + "You can also create an index from an existing table" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "dc99404d", + "metadata": {}, + "outputs": [], + "source": [ + "del index\n", + "\n", + "vec_store = LanceDBVectorStore.from_table(vector_store._table)\n", + "index = VectorStoreIndex.from_vector_store(vec_store)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b2e8cca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The author started Viaweb and Aspra.\n" + ] + } + ], + "source": [ + "query_engine = index.as_query_engine()\n", + "response = query_engine.query(\"What companies did the author start?\")\n", + "print(textwrap.fill(str(response), 100))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/src/notebooks/langchain_example.ipynb b/docs/src/notebooks/langchain_example.ipynb new file mode 100644 index 0000000000..10a48b4f00 --- /dev/null +++ b/docs/src/notebooks/langchain_example.ipynb @@ -0,0 +1,566 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "683953b3", + "metadata": {}, + "source": [ + "# LanceDB\n", + "\n", + ">[LanceDB](https://lancedb.com/) is an open-source database for vector-search built with persistent storage, which greatly simplifies retrieval, filtering and management of embeddings. Fully open source.\n", + "\n", + "This notebook shows how to use functionality related to the `LanceDB` vector database based on the Lance data format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1051ba9", + "metadata": {}, + "outputs": [], + "source": [ + "! 
pip install tantivy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88ac92c0", + "metadata": {}, + "outputs": [], + "source": [ + "! pip install -U langchain-openai langchain-community" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a1c84d6-a10f-428c-95cd-46d3a1702e07", + "metadata": {}, + "outputs": [], + "source": [ + "! pip install lancedb" + ] + }, + { + "cell_type": "markdown", + "id": "99134dd1-b91e-486f-8d90-534248e43b9d", + "metadata": {}, + "source": [ + "We want to use OpenAIEmbeddings so we have to get the OpenAI API Key. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a0361f5c-e6f4-45f4-b829-11680cf03cec", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d114ed78", + "metadata": {}, + "outputs": [], + "source": [ + "! 
rm -rf /tmp/lancedb" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a3c3999a", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import TextLoader\n", + "from langchain_community.vectorstores import LanceDB\n", + "from langchain_openai import OpenAIEmbeddings\n", + "from langchain_text_splitters import CharacterTextSplitter\n", + "\n", + "loader = TextLoader(\"../../how_to/state_of_the_union.txt\")\n", + "documents = loader.load()\n", + "\n", + "documents = CharacterTextSplitter().split_documents(documents)\n", + "embeddings = OpenAIEmbeddings()" + ] + }, + { + "cell_type": "markdown", + "id": "e9517bb0", + "metadata": {}, + "source": [ + "##### For LanceDB cloud, you can invoke the vector store as follows :\n", + "\n", + "\n", + "```python\n", + "db_url = \"db://lang_test\" # url of db you created\n", + "api_key = \"xxxxx\" # your API key\n", + "region=\"us-east-1-dev\" # your selected region\n", + "\n", + "vector_store = LanceDB(\n", + " uri=db_url,\n", + " api_key=api_key,\n", + " region=region,\n", + " embedding=embeddings,\n", + " table_name='langchain_test'\n", + " )\n", + "```\n", + "\n", + "You can also add `region`, `api_key`, `uri` to `from_documents()` classmethod\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6e104aee", + "metadata": {}, + "outputs": [], + "source": [ + "from lancedb.rerankers import LinearCombinationReranker\n", + "\n", + "reranker = LinearCombinationReranker(weight=0.3)\n", + "\n", + "docsearch = LanceDB.from_documents(documents, embeddings, reranker=reranker)\n", + "query = \"What did the president say about Ketanji Brown Jackson\"" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "259c7988", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "relevance score - 0.7066475030191711\n", + "text- They were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. 
\n", + "\n", + "Officer Mora was 27 years old. \n", + "\n", + "Officer Rivera was 22. \n", + "\n", + "Both Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers. \n", + "\n", + "I spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. \n", + "\n", + "I’ve worked on these issues a long time. \n", + "\n", + "I know what works: Investing in crime prevention and community police officers who’ll walk the beat, who’ll know the neighborhood, and who can restore trust and safety. \n", + "\n", + "So let’s not abandon our streets. Or choose between safety and equal justice. \n", + "\n", + "Let’s come together to protect our communities, restore trust, and hold law enforcement accountable. \n", + "\n", + "That’s why the Justice Department required body cameras, banned chokeholds, and restricted no-knock warrants for its officers. \n", + "\n", + "That’s why the American Rescue \n" + ] + } + ], + "source": [ + "docs = docsearch.similarity_search_with_relevance_scores(query)\n", + "print(\"relevance score - \", docs[0][1])\n", + "print(\"text- \", docs[0][0].page_content[:1000])" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "9fa29dae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "distance - 0.30000001192092896\n", + "text- My administration is providing assistance with job training and housing, and now helping lower-income veterans get VA care debt-free. \n", + "\n", + "Our troops in Iraq and Afghanistan faced many dangers. \n", + "\n", + "One was stationed at bases and breathing in toxic smoke from “burn pits” that incinerated wastes of war—medical and hazard material, jet fuel, and more. \n", + "\n", + "When they came home, many of the world’s fittest and best trained warriors were never the same. \n", + "\n", + "Headaches. 
Numbness. Dizziness. \n", + "\n", + "A cancer that would put them in a flag-draped coffin. \n", + "\n", + "I know. \n", + "\n", + "One of those soldiers was my son Major Beau Biden. \n", + "\n", + "We don’t know for sure if a burn pit was the cause of his brain cancer, or the diseases of so many of our troops. \n", + "\n", + "But I’m committed to finding out everything we can. \n", + "\n", + "Committed to military families like Danielle Robinson from Ohio. \n", + "\n", + "The widow of Sergeant First Class Heath Robinson. \n", + "\n", + "He was born a soldier. Army National Guard. Combat medic in Kosovo and Iraq. \n", + "\n", + "Stationed near Baghdad, just ya\n" + ] + } + ], + "source": [ + "docs = docsearch.similarity_search_with_score(query=\"Headaches\", query_type=\"hybrid\")\n", + "print(\"distance - \", docs[0][1])\n", + "print(\"text- \", docs[0][0].page_content[:1000])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e70ad201", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reranker : \n" + ] + } + ], + "source": [ + "print(\"reranker : \", docsearch._reranker)" + ] + }, + { + "cell_type": "markdown", + "id": "f5e1cdfd", + "metadata": {}, + "source": [ + "Additionally, to explore the table you can load it into a pandas DataFrame or save it to a CSV file: \n", + "```python\n", + "tbl = docsearch.get_table()\n", + "print(\"tbl:\", tbl)\n", + "pd_df = tbl.to_pandas()\n", + "# pd_df.to_csv(\"docsearch.csv\", index=False)\n", + "\n", + "# you can also create a new vector store object using an older connection object:\n", + "vector_store = LanceDB(connection=tbl, embedding=embeddings)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "9c608226", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "metadata : {'source': '../../how_to/state_of_the_union.txt'}\n", + "\n", + "SQL filtering :\n", + "\n", + "They were responding to a 
9-1-1 call when a man shot and killed them with a stolen gun. \n", + "\n", + "Officer Mora was 27 years old. \n", + "\n", + "Officer Rivera was 22. \n", + "\n", + "Both Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers. \n", + "\n", + "I spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. \n", + "\n", + "I’ve worked on these issues a long time. \n", + "\n", + "I know what works: Investing in crime prevention and community police officers who’ll walk the beat, who’ll know the neighborhood, and who can restore trust and safety. \n", + "\n", + "So let’s not abandon our streets. Or choose between safety and equal justice. \n", + "\n", + "Let’s come together to protect our communities, restore trust, and hold law enforcement accountable. \n", + "\n", + "That’s why the Justice Department required body cameras, banned chokeholds, and restricted no-knock warrants for its officers. \n", + "\n", + "That’s why the American Rescue Plan provided $350 Billion that cities, states, and counties can use to hire more police and invest in proven strategies like community violence interruption—trusted messengers breaking the cycle of violence and trauma and giving young people hope. \n", + "\n", + "We should all agree: The answer is not to Defund the police. The answer is to FUND the police with the resources and training they need to protect our communities. \n", + "\n", + "I ask Democrats and Republicans alike: Pass my budget and keep our neighborhoods safe. \n", + "\n", + "And I will keep doing everything in my power to crack down on gun trafficking and ghost guns you can buy online and make at home—they have no serial numbers and can’t be traced. \n", + "\n", + "And I ask Congress to pass proven measures to reduce gun violence. Pass universal background checks. 
Why should anyone on a terrorist list be able to purchase a weapon? \n", + "\n", + "Ban assault weapons and high-capacity magazines. \n", + "\n", + "Repeal the liability shield that makes gun manufacturers the only industry in America that can’t be sued. \n", + "\n", + "These laws don’t infringe on the Second Amendment. They save lives. \n", + "\n", + "The most fundamental right in America is the right to vote – and to have it counted. And it’s under assault. \n", + "\n", + "In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. \n", + "\n", + "We cannot let this happen. \n", + "\n", + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. \n", + "\n", + "A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", + "\n", + "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n", + "\n", + "We can do both. 
At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \n", + "\n", + "We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \n", + "\n", + "We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster.\n" + ] + } + ], + "source": [ + "docs = docsearch.similarity_search(\n", + " query=query, filter={\"metadata.source\": \"../../how_to/state_of_the_union.txt\"}\n", + ")\n", + "\n", + "print(\"metadata :\", docs[0].metadata)\n", + "\n", + "# or you can directly supply SQL string filters :\n", + "\n", + "print(\"\\nSQL filtering :\\n\")\n", + "docs = docsearch.similarity_search(query=query, filter=\"text LIKE '%Officer Rivera%'\")\n", + "print(docs[0].page_content)" + ] + }, + { + "cell_type": "markdown", + "id": "9a173c94", + "metadata": {}, + "source": [ + "## Adding images " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05f669d7", + "metadata": {}, + "outputs": [], + "source": [ + "! pip install -U langchain-experimental" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ed69810", + "metadata": {}, + "outputs": [], + "source": [ + "! pip install open_clip_torch torch" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "2cacb5ee", + "metadata": {}, + "outputs": [], + "source": [ + "! 
rm -rf '/tmp/multimmodal_lance'" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "b3456e2c", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_experimental.open_clip import OpenCLIPEmbeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "3848eba2", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import requests\n", + "\n", + "# List of image URLs to download\n", + "image_urls = [\n", + " \"https://github.com/raghavdixit99/assets/assets/34462078/abf47cc4-d979-4aaa-83be-53a2115bf318\",\n", + " \"https://github.com/raghavdixit99/assets/assets/34462078/93be928e-522b-4e37-889d-d4efd54b2112\",\n", + "]\n", + "\n", + "texts = [\"bird\", \"dragon\"]\n", + "\n", + "# Directory to save images\n", + "dir_name = \"./photos/\"\n", + "\n", + "# Create directory if it doesn't exist\n", + "os.makedirs(dir_name, exist_ok=True)\n", + "\n", + "image_uris = []\n", + "# Download and save each image\n", + "for i, url in enumerate(image_urls, start=1):\n", + " response = requests.get(url)\n", + " path = os.path.join(dir_name, f\"image{i}.jpg\")\n", + " image_uris.append(path)\n", + " with open(path, \"wb\") as f:\n", + " f.write(response.content)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "3d62c2a0", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.vectorstores import LanceDB\n", + "\n", + "vec_store = LanceDB(\n", + " table_name=\"multimodal_test\",\n", + " embedding=OpenCLIPEmbeddings(),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "ebbb4881", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['b673620b-01f0-42ca-a92e-d033bb92c0a6',\n", + " '99c3a5b0-b577-417a-8177-92f4a655dbfb']" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vec_store.add_images(uris=image_uris)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, 
+ "id": "3c29dea3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['f7adde5d-a4a3-402b-9e73-088b230722c3',\n", + " 'cbed59da-0aec-4bff-8820-9e59d81a2140']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vec_store.add_texts(texts)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "8b2f25ce", + "metadata": {}, + "outputs": [], + "source": [ + "img_embed = vec_store._embedding.embed_query(\"bird\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "87a24079", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='bird', metadata={'id': 'f7adde5d-a4a3-402b-9e73-088b230722c3'})" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vec_store.similarity_search_by_vector(img_embed)[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "78557867", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LanceTable(connection=LanceDBConnection(/tmp/lancedb), name=\"multimodal_test\")" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vec_store._table" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}