From ab94b6c89245fd46c37287193e6bb297a9f7f198 Mon Sep 17 00:00:00 2001 From: Tuana Celik Date: Thu, 10 Oct 2024 14:50:40 +0200 Subject: [PATCH] adding new cookbook --- README.md | 1 + index.toml | 6 + notebooks/metadata_enrichment.ipynb | 506 ++++++++++++++++++++++++++++ 3 files changed, 513 insertions(+) create mode 100644 notebooks/metadata_enrichment.ipynb diff --git a/README.md b/README.md index a0f932d..382bccc 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ For more examples, you may also find our [Blog](https://haystack.deepset.ai/blog | Speaker Diarization with AssemblyAI | Open In Colab| | Advance Prompt Customization for Anthropic | Open In Colab| | Advanced RAG: Query Decomposition and Reasoning | Open In Colab| +| Advanced RAG: Automated Structured Metadata Enrichment | Open In Colab| | Techcrunch News Digest with Local LLMs using TitanML Takeoff | Open In Colab| | Use Gemini Models with Vertex AI| Open In Colab| | Gradient AI Embedders and Generators for RAG | Open In Colab| diff --git a/index.toml b/index.toml index 2de2328..0cebd52 100644 --- a/index.toml +++ b/index.toml @@ -255,3 +255,9 @@ title = "Agentic RAG with Llama 3.2 3B" notebook = "llama32_agentic_rag.ipynb" topics = ["RAG", "Agents", "Web-QA"] new = true + +[[cookbook]] +title = "Advanced RAG: Automated Structured Metadata Enrichment" +notebook = "metadata_enrichment.ipynb" +new = true +topics = ["Advanced Retrieval", "RAG", "Metadata"] \ No newline at end of file diff --git a/notebooks/metadata_enrichment.ipynb b/notebooks/metadata_enrichment.ipynb new file mode 100644 index 0000000..ee95d98 --- /dev/null +++ b/notebooks/metadata_enrichment.ipynb @@ -0,0 +1,506 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "iYhPbdnTN_9v" + }, + "source": [ + "# Advanced RAG: Automated Structured Metadata Enrichment\n", + "\n", + "by Tuana Celik ([LI](https://www.linkedin.com/in/tuanacelik/), [Twitter](https://x.com/tuanacelik))\n", + "\n", + "In this example, you'll 
see how you can make use of structured outputs, which are an option for some LLMs, and a custom Haystack component, to automate the enrichment of metadata from documents.\n", + "\n", + "You will see how you can define your own metadata fields as a Pydantic Model, as well as the data types each field should have. Finally, you will get a custom `MetadataEnricher` to extract the required fields and add them to the document meta information.\n", + "\n", + "In this example, we will be enriching metadata with information relating to funding announcements.\n", + "\n", + "Once we populate the metadata of a document with our own fields, we are able to use Metadata Filtering during the retrieval step of RAG pipelines. We can even combine this with [Metadata Extraction from Queries to Improve Retrieval](https://haystack.deepset.ai/blog/extracting-metadata-filter) to be very precise about what documents we are providing as context to an LLM." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-B7NJJzYZWcv" + }, + "source": [ + "### Install requirements" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9mhsRh7zjCQO", + "outputId": "c898ec01-f4e4-444e-8ea3-ea2368be3358" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: haystack-ai in /usr/local/lib/python3.10/dist-packages (2.6.1)\n", + "Requirement already satisfied: haystack-experimental in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (0.2.0)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.1.4)\n", + "Requirement already satisfied: lazy-imports in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (0.3.1)\n", + "Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (10.5.0)\n", + "Requirement already satisfied: networkx 
in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.3)\n", + "Requirement already satisfied: numpy<2 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.26.4)\n", + "Requirement already satisfied: openai>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.51.2)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (2.2.2)\n", + "Requirement already satisfied: posthog in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.7.0)\n", + "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (2.8.2)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (6.0.2)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (2.32.3)\n", + "Requirement already satisfied: tenacity!=8.4.0 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (9.0.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (4.66.5)\n", + "Requirement already satisfied: typing-extensions>=4.7 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (4.12.2)\n", + "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (3.7.1)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai>=1.1.0->haystack-ai) (1.7.0)\n", + "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (0.27.2)\n", + "Requirement already satisfied: jiter<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (0.6.1)\n", + "Requirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (2.9.2)\n", + "Requirement already satisfied: 
sniffio in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (1.3.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->haystack-ai) (2.1.5)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai) (2024.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil->haystack-ai) (1.16.0)\n", + "Requirement already satisfied: monotonic>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (1.6)\n", + "Requirement already satisfied: backoff>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (2.2.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai) (2.2.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai) (2024.8.30)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai>=1.1.0->haystack-ai) (1.2.2)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai) (1.0.6)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai) (0.14.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in 
/usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai>=1.1.0->haystack-ai) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.23.4 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai>=1.1.0->haystack-ai) (2.23.4)\n", + "Requirement already satisfied: trafilatura in /usr/local/lib/python3.10/dist-packages (1.12.2)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from trafilatura) (2024.8.30)\n", + "Requirement already satisfied: courlan>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from trafilatura) (1.3.1)\n", + "Requirement already satisfied: htmldate>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from trafilatura) (1.9.1)\n", + "Requirement already satisfied: justext>=3.0.1 in /usr/local/lib/python3.10/dist-packages (from trafilatura) (3.0.1)\n", + "Requirement already satisfied: lxml>=5.2.2 in /usr/local/lib/python3.10/dist-packages (from trafilatura) (5.3.0)\n", + "Requirement already satisfied: charset-normalizer>=3.2.0 in /usr/local/lib/python3.10/dist-packages (from trafilatura) (3.3.2)\n", + "Requirement already satisfied: urllib3<3,>=1.26 in /usr/local/lib/python3.10/dist-packages (from trafilatura) (2.2.3)\n", + "Requirement already satisfied: babel>=2.16.0 in /usr/local/lib/python3.10/dist-packages (from courlan>=1.2.0->trafilatura) (2.16.0)\n", + "Requirement already satisfied: tld>=0.13 in /usr/local/lib/python3.10/dist-packages (from courlan>=1.2.0->trafilatura) (0.13)\n", + "Requirement already satisfied: dateparser>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from htmldate>=1.8.1->trafilatura) (1.2.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from htmldate>=1.8.1->trafilatura) (2.8.2)\n", + "Requirement already satisfied: pytz in /usr/local/lib/python3.10/dist-packages (from dateparser>=1.1.2->htmldate>=1.8.1->trafilatura) (2024.2)\n", + "Requirement already satisfied: 
regex!=2019.02.19,!=2021.8.27 in /usr/local/lib/python3.10/dist-packages (from dateparser>=1.1.2->htmldate>=1.8.1->trafilatura) (2024.9.11)\n", + "Requirement already satisfied: tzlocal in /usr/local/lib/python3.10/dist-packages (from dateparser>=1.1.2->htmldate>=1.8.1->trafilatura) (5.2)\n", + "Requirement already satisfied: lxml-html-clean in /usr/local/lib/python3.10/dist-packages (from lxml[html_clean]>=4.4.2->justext>=3.0.1->trafilatura) (0.3.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->htmldate>=1.8.1->trafilatura) (1.16.0)\n" + ] + } + ], + "source": [ + "!pip install haystack-ai\n", + "!pip install trafilatura" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "id": "nYsEFBwnOIb8" + }, + "outputs": [], + "source": [ + "from haystack import Document, component\n", + "from haystack.components.builders import PromptBuilder\n", + "from haystack.components.converters import HTMLToDocument\n", + "from haystack.components.fetchers import LinkContentFetcher\n", + "from haystack.components.generators import OpenAIGenerator\n", + "from haystack.components.generators.openai_utils import _convert_message_to_openai_format\n", + "from haystack.components.preprocessors import DocumentSplitter\n", + "from haystack.dataclasses import ChatMessage, StreamingChunk\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "\n", + "from openai import OpenAI, Stream\n", + "from openai.types.chat import ChatCompletion, ChatCompletionChunk\n", + "from typing import List, Any, Dict, Optional, Callable, Union\n", + "from pydantic import BaseModel" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0m05uf0c-ahX" + }, + "source": [ + "## 🧪 Experimental Addition to the OpenAIGenerator for Structured Output Support\n", + "\n", + "> 🚀 This is the same extension to the `OpenAIGenerator` that was used in the [Advanced RAG: Query Decomposition and 
Reasoning](https://haystack.deepset.ai/cookbook/query_decomposition) example\n", + "\n", + "Let's extend the `OpenAIGenerator` to be able to make use of the [structured output option by OpenAI](https://platform.openai.com/docs/guides/structured-outputs/introduction). Below, we extend the class to call `self.client.beta.chat.completions.parse` if the user has provided a `response_format` in `generation_kwargs`. This will allow us to provide a Pydantic Model to the generator and request our generator to respond with structured outputs that adhere to this Pydantic schema." + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "id": "6ihGheQx-gSk" + }, + "outputs": [], + "source": [ + "class OpenAIGenerator(OpenAIGenerator):\n", + " def __init__(self, **kwargs):\n", + " super().__init__(**kwargs)\n", + "\n", + " @component.output_types(replies=List[str], meta=List[Dict[str, Any]], structured_reply=BaseModel)\n", + " def run(self, prompt: str, streaming_callback: Optional[Callable[[StreamingChunk], None]] = None, generation_kwargs: Optional[Dict[str, Any]] = None,):\n", + " generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}\n", + " if \"response_format\" in generation_kwargs.keys():\n", + " message = ChatMessage.from_user(prompt)\n", + " if self.system_prompt:\n", + " messages = [ChatMessage.from_system(self.system_prompt), message]\n", + " else:\n", + " messages = [message]\n", + "\n", + " streaming_callback = streaming_callback or self.streaming_callback\n", + " openai_formatted_messages = [_convert_message_to_openai_format(message) for message in messages]\n", + " completion: Union[Stream[ChatCompletionChunk], ChatCompletion] = self.client.beta.chat.completions.parse(\n", + " model=self.model,\n", + " messages=openai_formatted_messages,\n", + " **generation_kwargs)\n", + " completions = [self._build_structured_message(completion, choice) for choice in completion.choices]\n", + " for response in completions:\n", + " 
self._check_finish_reason(response)\n", + "\n", + " return {\n", + " \"replies\": [message.content for message in completions],\n", + " \"meta\": [message.meta for message in completions],\n", + " \"structured_reply\": completions[0].content\n", + " }\n", + " else:\n", + " return super().run(prompt, streaming_callback, generation_kwargs)\n", + "\n", + " def _build_structured_message(self, completion: Any, choice: Any) -> ChatMessage:\n", + " chat_message = ChatMessage.from_assistant(choice.message.parsed or \"\")\n", + " chat_message.meta.update(\n", + " {\n", + " \"model\": completion.model,\n", + " \"index\": choice.index,\n", + " \"finish_reason\": choice.finish_reason,\n", + " \"usage\": dict(completion.usage),\n", + " }\n", + " )\n", + " return chat_message" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7JGz6wd6mZ2M", + "outputId": "fd3c7e1d-95c4-464c-bf4c-5fa4a509f41c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OpenAI API Key:··········\n" + ] + } + ], + "source": [ + "import os\n", + "from getpass import getpass\n", + "\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API Key:\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TJB0zJayZwBx" + }, + "source": [ + "## Custom `MetadataEnricher`\n", + "\n", + "We create a custom Haystack component that is able to accept `metadata_model` and `prompt`. If no prompt is provided, it uses the `DEFAULT_PROMPT`.\n", + "\n", + "This component returns `documents` enriched with the requested metadata fields." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "id": "TXHDmsunFI5V" + }, + "outputs": [], + "source": [ + "from haystack import component, Pipeline, Document\n", + "from haystack.components.builders.prompt_builder import PromptBuilder\n", + "\n", + "DEFAULT_PROMPT = \"\"\"\n", + "Given the contents of the documents, extract the requested metadata.\n", + "The requested metadata is {{ metadata_model }}\n", + "Document:\n", + "{{document}}\n", + "Metadata:\n", + "\"\"\"\n", + "@component\n", + "class MetadataEnricher:\n", + "\n", + " def __init__(self, metadata_model: BaseModel, prompt:str = DEFAULT_PROMPT):\n", + " self.metadata_model = metadata_model\n", + " self.metadata_prompt = prompt\n", + "\n", + " builder = PromptBuilder(self.metadata_prompt)\n", + " llm = OpenAIGenerator(generation_kwargs={\"response_format\": metadata_model})\n", + " self.pipeline = Pipeline()\n", + " self.pipeline.add_component(name=\"builder\", instance=builder)\n", + " self.pipeline.add_component(name=\"llm\", instance=llm)\n", + " self.pipeline.connect(\"builder\", \"llm\")\n", + "\n", + " @component.output_types(documents=List[Document])\n", + " def run(self, documents: List[Document]):\n", + " documents_with_meta = []\n", + " for document in documents:\n", + " result = self.pipeline.run({'builder': {'document': document.content, 'metadata_model': self.metadata_model}})\n", + " metadata = result['llm']['structured_reply']\n", + " document.meta.update(metadata.dict())\n", + " documents_with_meta.append(document)\n", + " return {\"documents\": documents_with_meta}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9Ht7fsHPl_jI" + }, + "source": [ + "## Define Metadata Fields as a Pydantic Model\n", + "\n", + "For automatic metadata enrichment, we want to be able to provide a structure describing what fields we want to extract, as well as what types they should be.\n", + "\n", + "Below, I have defined a `Metadata` model, with 4 fields.\n", + 
"\n", + "> 💡 **Note:** In some cases, it might make sense to make each field optional, or provide default values." + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "id": "qlKb8fwln5ha" + }, + "outputs": [], + "source": [ + "class Metadata(BaseModel):\n", + " company: str\n", + " year: int\n", + " funding_value: int\n", + " funding_currency: str" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bjNyCykWapxU" + }, + "source": [ + "Next, we initialize a `MetadataEnricher` and provide `Metadata` as the `metadata_model` we want to abide by." + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": { + "id": "7kJflm1jGdNg" + }, + "outputs": [], + "source": [ + "enricher = MetadataEnricher(metadata_model=Metadata)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9ZTQrpc7a0gF" + }, + "source": [ + "## Build an Automated Metadata Enrichment Pipeline\n", + "\n", + "Now that we have our `enricher`, we can use it in a pipeline. Below is an example of a pipeline that fetches the contents of some URLs (in this case, urls that contain information about funding announcements). 
The pipeline then adds the requested metadata fields to each `Document`'s `meta` field 👇" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WiBYhEzhOqTn", + "outputId": "ea8760e4-abf3-4867-c0bc-e3b645522671" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'enricher': {'documents': [Document(id=5844517120556b13f92430ea8af9837714ede1b351580c43c2ddce9b646cb6cb, content: 'Deepset, a platform for building enterprise apps powered by large language models akin to ChatGPT, t...', meta: {'content_type': 'text/html', 'url': 'https://techcrunch.com/2023/08/09/deepset-secures-30m-to-expand-its-llm-focused-mlops-offerings/', 'company': 'Deepset', 'year': 2023, 'funding_value': 30000000, 'funding_currency': 'USD'}),\n", + " Document(id=8cdcb63a4e006b1cac902ebc2e012cd95156d188777e0d0c8bd407a92f4491c7, content: 'Arize AI Raises $38 Million Series B To Scale Machine Learning Observability Platform\n", + " As companies t...', meta: {'content_type': 'text/html', 'url': 'https://www.prnewswire.com/news-releases/arize-ai-raises-38-million-series-b-to-scale-machine-learning-observability-platform-301620603.html', 'company': 'Arize AI', 'year': 2022, 'funding_value': 38000000, 'funding_currency': 'USD'})]}}" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline = Pipeline()\n", + "pipeline.add_component(\"fetcher\", LinkContentFetcher())\n", + "pipeline.add_component(\"converter\", HTMLToDocument())\n", + "pipeline.add_component(\"enricher\", enricher)\n", + "\n", + "\n", + "pipeline.connect(\"fetcher\", \"converter\")\n", + "pipeline.connect(\"converter.documents\", \"enricher.documents\")\n", + "\n", + "pipeline.run({\"fetcher\": {\"urls\": ['https://techcrunch.com/2023/08/09/deepset-secures-30m-to-expand-its-llm-focused-mlops-offerings/',\n", + " 
'https://www.prnewswire.com/news-releases/arize-ai-raises-38-million-series-b-to-scale-machine-learning-observability-platform-301620603.html']}})" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 955 + }, + "id": "uIzoahGZREwr", + "outputId": "04125f08-95d1-4791-dafd-fcaff93a180e" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pipeline.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ev0SiDqnbOCa" + }, + "source": [ + "## Extra: Metadata Inheritance\n", + "\n", + "This is just an extra step to show how metadata that belongs to a document is inherited by the document chunks if you use a component such as the `DocumentSplitter`." + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gAPfuWMMRGnP", + "outputId": "c2c0d04f-8c0e-4a38-f319-3e2e343e0306" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + "🚅 Components\n", + " - fetcher: LinkContentFetcher\n", + " - converter: HTMLToDocument\n", + " - enricher: MetadataEnricher\n", + " - splitter: DocumentSplitter\n", + "🛤️ Connections\n", + " - fetcher.streams -> converter.sources (List[ByteStream])\n", + " - converter.documents -> enricher.documents (List[Document])\n", + " - enricher.documents -> splitter.documents (List[Document])" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.add_component(\"splitter\", DocumentSplitter())\n", + "\n", + "pipeline.connect(\"enricher\", \"splitter\")" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BRgr9jy0Ro8Y", + "outputId": "0255a435-54e2-45d3-9edb-aea5b1f81758" + }, + "outputs": [ + 
{ + "data": { + "text/plain": [ + "{'splitter': {'documents': [Document(id=9611aa2bdb658163d8f6964220052065936fcd036dd24743d1b34ce79d25bc5a, content: 'Deepset, a platform for building enterprise apps powered by large language models akin to ChatGPT, t...', meta: {'content_type': 'text/html', 'url': 'https://techcrunch.com/2023/08/09/deepset-secures-30m-to-expand-its-llm-focused-mlops-offerings/', 'company': 'Deepset', 'year': 2023, 'funding_value': 30000000, 'funding_currency': 'USD', 'source_id': '5844517120556b13f92430ea8af9837714ede1b351580c43c2ddce9b646cb6cb', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}),\n", + " Document(id=6bffbcf9f1cd1a3940628d1450c9ba9a8c9a092136896d295b35af3175caffbf, content: 'unfortunate state of affairs is likely contributing to challenges around AI development within the e...', meta: {'content_type': 'text/html', 'url': 'https://techcrunch.com/2023/08/09/deepset-secures-30m-to-expand-its-llm-focused-mlops-offerings/', 'company': 'Deepset', 'year': 2023, 'funding_value': 30000000, 'funding_currency': 'USD', 'source_id': '5844517120556b13f92430ea8af9837714ede1b351580c43c2ddce9b646cb6cb', 'page_number': 1, 'split_id': 1, 'split_idx_start': 1256}),\n", + " Document(id=da372f9bc2292f487f0aad372053a531744d9549a46f8ac197c216abaa4d99d0, content: 'to end users, and perform analyses of the LLMs’ accuracy while continuously monitoring their perform...', meta: {'content_type': 'text/html', 'url': 'https://techcrunch.com/2023/08/09/deepset-secures-30m-to-expand-its-llm-focused-mlops-offerings/', 'company': 'Deepset', 'year': 2023, 'funding_value': 30000000, 'funding_currency': 'USD', 'source_id': '5844517120556b13f92430ea8af9837714ede1b351580c43c2ddce9b646cb6cb', 'page_number': 1, 'split_id': 2, 'split_idx_start': 2609}),\n", + " Document(id=f316cd275e8bc763de41d128dbdbd81e1baad2693b0102f6951c4f46aa8f6048, content: 'predicts that the sector for MLOps will reach $23.1 billion by 2031, up from around $1 billion in 20...', meta: 
{'content_type': 'text/html', 'url': 'https://techcrunch.com/2023/08/09/deepset-secures-30m-to-expand-its-llm-focused-mlops-offerings/', 'company': 'Deepset', 'year': 2023, 'funding_value': 30000000, 'funding_currency': 'USD', 'source_id': '5844517120556b13f92430ea8af9837714ede1b351580c43c2ddce9b646cb6cb', 'page_number': 1, 'split_id': 3, 'split_idx_start': 3997}),\n", + " Document(id=c7ff4e0d7af8aaa16f3195cb1f9096bb1cf8e7d985190fa6746c278b1d8457e8, content: 'Arize AI Raises $38 Million Series B To Scale Machine Learning Observability Platform\n", + " As companies t...', meta: {'content_type': 'text/html', 'url': 'https://www.prnewswire.com/news-releases/arize-ai-raises-38-million-series-b-to-scale-machine-learning-observability-platform-301620603.html', 'company': 'Arize AI', 'year': 2022, 'funding_value': 38000000, 'funding_currency': 'USD', 'source_id': '8cdcb63a4e006b1cac902ebc2e012cd95156d188777e0d0c8bd407a92f4491c7', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}),\n", + " Document(id=646f4d43ee20fcbe35c97bd04e8fd6edd4ad5c9af63fe4a63267f5df1807254f, content: 'by humans.\n", + " Launched in 2020, Arize's ML observability platform is already counted on by a growing li...', meta: {'content_type': 'text/html', 'url': 'https://www.prnewswire.com/news-releases/arize-ai-raises-38-million-series-b-to-scale-machine-learning-observability-platform-301620603.html', 'company': 'Arize AI', 'year': 2022, 'funding_value': 38000000, 'funding_currency': 'USD', 'source_id': '8cdcb63a4e006b1cac902ebc2e012cd95156d188777e0d0c8bd407a92f4491c7', 'page_number': 1, 'split_id': 1, 'split_idx_start': 1360}),\n", + " Document(id=09382ae0ab9adbd7860199b0d86e8ca044787eda860dfb181e3b243c6584a427, content: 'what happened, and improve overall model performance,\" says Morgan Gerlak, Partner at TCV. 
\"Like oth...', meta: {'content_type': 'text/html', 'url': 'https://www.prnewswire.com/news-releases/arize-ai-raises-38-million-series-b-to-scale-machine-learning-observability-platform-301620603.html', 'company': 'Arize AI', 'year': 2022, 'funding_value': 38000000, 'funding_currency': 'USD', 'source_id': '8cdcb63a4e006b1cac902ebc2e012cd95156d188777e0d0c8bd407a92f4491c7', 'page_number': 1, 'split_id': 2, 'split_idx_start': 2697}),\n", + " Document(id=94497826791e38f15016a2360d7e3eea5e242770f51dd11be69fb21210e89c9a, content: 'you are going to be left behind,\" notes Brett Wilson, Co-Founder and General Partner at Swift Ventur...', meta: {'content_type': 'text/html', 'url': 'https://www.prnewswire.com/news-releases/arize-ai-raises-38-million-series-b-to-scale-machine-learning-observability-platform-301620603.html', 'company': 'Arize AI', 'year': 2022, 'funding_value': 38000000, 'funding_currency': 'USD', 'source_id': '8cdcb63a4e006b1cac902ebc2e012cd95156d188777e0d0c8bd407a92f4491c7', 'page_number': 1, 'split_id': 3, 'split_idx_start': 3967})]}}" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.run({\"fetcher\": {\"urls\": ['https://techcrunch.com/2023/08/09/deepset-secures-30m-to-expand-its-llm-focused-mlops-offerings/',\n", + " 'https://www.prnewswire.com/news-releases/arize-ai-raises-38-million-series-b-to-scale-machine-learning-observability-platform-301620603.html']}})" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}