From 2f06069760f83901d6eefe16db547fdb593e2616 Mon Sep 17 00:00:00 2001
From: Julia Gomes
Date: Fri, 26 Jul 2024 13:58:32 -0700
Subject: [PATCH 1/8] Add tutorial

---
 ...ize_hallucination_evaluator_with_rag.ipynb | 620 ++++++++++++++++++
 1 file changed, 620 insertions(+)
 create mode 100644 tutorial_arize_hallucination_evaluator_with_rag.ipynb

diff --git a/tutorial_arize_hallucination_evaluator_with_rag.ipynb b/tutorial_arize_hallucination_evaluator_with_rag.ipynb
new file mode 100644
index 0000000..ac38d88
--- /dev/null
+++ b/tutorial_arize_hallucination_evaluator_with_rag.ipynb
@@ -0,0 +1,620 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "vAuY_sNgr4OQ"
+   },
+   "source": [
+    "# Install Dependencies\n",
+    "Several packages are required for OpenTelemetry (OTel), LlamaIndex and OpenAI."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "HY97cJFAk7Sc",
+    "outputId": "c3fb7ff5-988a-4e16-d7d3-732b043253df"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🔑 Enter your OpenAI API key: ··········\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install -qq 'openinference-instrumentation-llama-index>=0.1.6' llama-index-llms-openai opentelemetry-exporter-otlp 'llama-index>=0.10.3' \"llama-index-callbacks-arize-phoenix>=0.1.2\" arize-otel\n",
+    "\n",
+    "import os\n",
+    "from getpass import getpass\n",
+    "\n",
+    "openai_api_key = getpass(\"🔑 Enter your OpenAI API key: \")\n",
+    "os.environ[\"OPENAI_API_KEY\"] = openai_api_key"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "OZvseSFHsD7M"
+   },
+   "source": [
+    "# Initialize Arize Phoenix\n",
+    "Set up the OpenTelemetry (OTel) tracer used by the `LlamaIndexInstrumentor`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "VCDUdapRrqpd",
+    "outputId": "53b29ff8-34da-4bba-c730-ce06d3e13511"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🔑 Enter your Arize space key in the space settings page of the Arize UI: ··········\n",
+      "🔑 Enter your Arize API key in the space settings page of the Arize UI: ··········\n"
+     ]
+    }
+   ],
+   "source": [
+    "from openinference.instrumentation.llama_index import LlamaIndexInstrumentor\n",
+    "from arize_otel import register_otel, Endpoints\n",
+    "\n",
+    "# Setup OTEL via our convenience function\n",
+    "register_otel(\n",
+    "    endpoints = Endpoints.ARIZE,\n",
+    "    space_key = getpass(\"🔑 Enter your Arize space key in the space settings page of the Arize UI: \"),\n",
+    "    api_key = getpass(\"🔑 Enter your Arize API key in the space settings page of the Arize UI: \"),\n",
+    "    model_id = \"test-guard-july10-6:07pm\",  # name this whatever you would like\n",
+    ")\n",
+    "LlamaIndexInstrumentor().instrument()"
+   ]
+  },
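+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optional sanity check (a minimal sketch, not required for the rest of the tutorial): emit one hand-made span through the standard OpenTelemetry API to confirm that `register_otel` above configured the global tracer provider. The span name `setup-check` and the attribute key `tutorial.step` are arbitrary placeholders, not part of the Arize or LlamaIndex APIs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: assumes register_otel above has already configured the global tracer provider.\n",
+    "from opentelemetry import trace\n",
+    "\n",
+    "tracer = trace.get_tracer(__name__)\n",
+    "\n",
+    "# Emit a single hand-made span; it should arrive at the configured Arize endpoint\n",
+    "# alongside the spans produced automatically by LlamaIndexInstrumentor.\n",
+    "with tracer.start_as_current_span(\"setup-check\") as span:\n",
+    "    span.set_attribute(\"tutorial.step\", \"instrumentation-smoke-test\")\n",
+    "    print(\"emitted test span, trace_id:\", span.get_span_context().trace_id)"
+   ]
+  },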
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "0z-3-Reas5TW"
+   },
+   "source": [
+    "# Instrument Guardrails AI\n",
+    "Install Guardrails AI, then define the `LlmRagEvaluator` validator and the `HallucinationPrompt` it uses to judge RAG responses."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "!pip install -qq guardrails-ai litellm"
+   ],
+   "metadata": {
+    "collapsed": true,
+    "id": "Q8nwDMRXfmjy"
+   },
+   "execution_count": 5,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "import os\n",
+    "from typing import Any, Callable, Dict, Optional, Type\n",
+    "import logging\n",
+    "from abc import ABC, abstractmethod\n",
+    "\n",
+    "from guardrails.validator_base import (\n",
+    "    FailResult,\n",
+    "    PassResult,\n",
+    "    ValidationResult,\n",
+    "    Validator,\n",
+    "    register_validator,\n",
+    ")\n",
+    "from guardrails.stores.context import get_call_kwarg\n",
+    "from litellm import completion, get_llm_provider\n",
+    "\n",
+    "logger = logging.getLogger(__name__)\n",
+    "\n",
+    "\n",
+    "class ArizeRagEvalPromptBase(ABC):\n",
+    "    def __init__(self, prompt_name, **kwargs) -> None:\n",
+    "        self.prompt_name = prompt_name\n",
+    "\n",
+    "    @abstractmethod\n",
+    "    def generate_prompt(self, user_input_message: str, reference_text: str, llm_response: str) -> str:\n",
+    "        pass\n",
+    "\n",
+    "\n",
+    "class HallucinationPrompt(ArizeRagEvalPromptBase):\n",
+    "    def generate_prompt(self, user_input_message: str, reference_text: str, llm_response: str) -> str:\n",
+    "        return f\"\"\"\n",
+    "        In this task, you will be presented with a query, a reference text and an answer. The answer is\n",
+    "        generated to the question based on the reference text. The answer may contain false information. You\n",
+    "        must use the reference text to determine if the answer to the question contains false information,\n",
+    "        if the answer is a hallucination of facts. Your objective is to determine whether the answer text\n",
+    "        contains factual information and is not a hallucination. A 'hallucination' refers to\n",
+    "        an answer that is not based on the reference text or assumes information that is not available in\n",
+    "        the reference text. Your response should be a single word: either \"factual\" or \"hallucinated\", and\n",
+    "        it should not include any other text or characters. \"hallucinated\" indicates that the answer\n",
+    "        provides factually inaccurate information to the query based on the reference text. \"factual\"\n",
+    "        indicates that the answer to the question is correct relative to the reference text, and does not\n",
+    "        contain made up information. Please read the query and reference text carefully before determining\n",
+    "        your response.\n",
+    "\n",
+    "        [BEGIN DATA]\n",
+    "        ************\n",
+    "        [Query]: {user_input_message}\n",
+    "        ************\n",
+    "        [Reference text]: {reference_text}\n",
+    "        ************\n",
+    "        [Answer]: {llm_response}\n",
+    "        ************\n",
+    "        [END DATA]\n",
+    "\n",
+    "        Is the answer above factual or hallucinated based on the query and reference text?\n",
+    "        \"\"\"\n",
+    "\n",
+    "\n",
+    "@register_validator(name=\"arize/llm_rag_evaluator\", data_type=\"string\")\n",
+    "class LlmRagEvaluator(Validator):\n",
+    "    \"\"\"This class validates an LLM-generated output by prompting a second LLM (called via LiteLLM) to evaluate it.\n",
+    "\n",
+    "    **Key Properties**\n",
+    "\n",
+    "    | Property                      | Description                       |\n",
+    "    | ----------------------------- | --------------------------------- |\n",
+    "    | Name for `format` attribute   | `arize/llm_rag_evaluator`         |\n",
+    "    | Supported data types          | `string`                          |\n",
+    "    | Programmatic fix              | N/A                               |\n",
+    "\n",
+    "    Args:\n",
+    "        llm_callable (str): The name of the LiteLLM model to use for validation (e.g. \"gpt-4o-mini\").\n",
+    "        on_fail (str | Callable, optional): Action or callback to use when validation fails. Defaults to \"noop\".\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        eval_llm_prompt_generator: Type[ArizeRagEvalPromptBase],\n",
+    "        llm_evaluator_fail_response: str,\n",
+    "        llm_evaluator_pass_response: str,\n",
+    "        llm_callable: str,\n",
+    "        on_fail: Optional[Callable] = \"noop\",\n",
+    "        **kwargs,\n",
+    "    ):\n",
+    "        super().__init__(\n",
+    "            on_fail,\n",
+    "            eval_llm_prompt_generator=eval_llm_prompt_generator,\n",
+    "            llm_evaluator_fail_response=llm_evaluator_fail_response,\n",
+    "            llm_evaluator_pass_response=llm_evaluator_pass_response,\n",
+    "            llm_callable=llm_callable,\n",
+    "            **kwargs)\n",
+    "        self._llm_evaluator_prompt_generator = eval_llm_prompt_generator\n",
+    "        self._llm_callable = llm_callable\n",
+    "        self._fail_response = llm_evaluator_fail_response\n",
+    "        self._pass_response = llm_evaluator_pass_response\n",
+    "\n",
+    "    def get_llm_response(self, prompt: str) -> str:\n",
+    "        \"\"\"Gets the response from the LLM.\n",
+    "\n",
+    "        Args:\n",
+    "            prompt (str): The prompt to send to the LLM.\n",
+    "\n",
+    "        Returns:\n",
+    "            str: The response from the LLM.\n",
+    "        \"\"\"\n",
+    "        # 0. Create messages\n",
+    "        messages = [{\"content\": prompt, \"role\": \"user\"}]\n",
+    "\n",
+    "        # 0b. Setup auth kwargs if the model is from OpenAI\n",
+    "        kwargs = {}\n",
+    "        _model, provider, *_rest = get_llm_provider(self._llm_callable)\n",
+    "        if provider == \"openai\":\n",
+    "            kwargs[\"api_key\"] = get_call_kwarg(\"api_key\") or os.environ.get(\"OPENAI_API_KEY\")\n",
+    "\n",
+    "        # 1. Get LLM response\n",
+    "        # Strip whitespace and convert to lowercase\n",
+    "        try:\n",
+    "            response = completion(model=self._llm_callable, messages=messages, **kwargs)\n",
+    "            response = response.choices[0].message.content  # type: ignore\n",
+    "            response = response.strip().lower()\n",
+    "        except Exception as e:\n",
+    "            raise RuntimeError(f\"Error getting response from the LLM: {e}\") from e\n",
+    "\n",
+    "        # 2. Return the response\n",
+    "        return response\n",
+    "\n",
+    "    def validate(self, value: Any, metadata: Dict) -> ValidationResult:\n",
+    "        \"\"\"\n",
+    "        Validation is based on an LLM judge's evaluation of the RAG response against the user query and reference text.\n",
+    "\n",
+    "        Args:\n",
+    "            value (Any): The RAG LLM response to validate.\n",
+    "            metadata (Dict): The metadata for the validation.\n",
+    "                user_message: Required key. User query passed into the RAG LLM.\n",
+    "                context: Required key. Context used by the RAG LLM.\n",
+    "                llm_response: Optional key. By default, the guarded LLM will make the RAG LLM call, which corresponds\n",
+    "                    to the `value`. If the user calls the guard with on=\"prompt\", then the original RAG LLM response\n",
+    "                    needs to be passed into the guard as metadata for the LLM judge to evaluate.\n",
+    "\n",
+    "        Returns:\n",
+    "            ValidationResult: The result of the validation. A PassResult if the LLM judge returns the pass\n",
+    "                response, or a FailResult otherwise.\n",
+    "        \"\"\"\n",
+    "        # 1. Get the user query and reference text from the metadata\n",
+    "        user_input_message = metadata.get(\"user_message\")\n",
+    "        if user_input_message is None:\n",
+    "            raise RuntimeError(\n",
+    "                \"'user_message' missing from metadata. \"\n",
+    "                \"Please provide the original user query.\"\n",
+    "            )\n",
+    "\n",
+    "        reference_text = metadata.get(\"context\")\n",
+    "        if reference_text is None:\n",
+    "            raise RuntimeError(\n",
+    "                \"'context' missing from metadata. \"\n",
+    "                \"Please provide the reference text.\"\n",
+    "            )\n",
+    "\n",
+    "        # Option to override guarded LLM call with response passed in through metadata\n",
+    "        if metadata.get(\"llm_response\") is not None:\n",
+    "            value = metadata.get(\"llm_response\")\n",
+    "\n",
+    "        # 2. Setup the prompt\n",
+    "        prompt = self._llm_evaluator_prompt_generator.generate_prompt(user_input_message=user_input_message, reference_text=reference_text, llm_response=value)\n",
+    "        print(f\"\\nevaluator prompt: {prompt}\")\n",
+    "\n",
+    "        # 3. Get the LLM response\n",
+    "        llm_response = self.get_llm_response(prompt)\n",
+    "        print(f\"\\nllm evaluator response: {llm_response}\")\n",
+    "\n",
+    "        # 4. Check the LLM response and return the result\n",
+    "        if llm_response == self._fail_response:\n",
+    "            print(f\"\\nVALIDATION FAILED\")\n",
+    "            return FailResult(error_message=f\"The LLM says {self._fail_response}. The validation failed.\")\n",
+    "\n",
+    "        if llm_response == self._pass_response:\n",
+    "            print(f\"\\nVALIDATION PASSED\")\n",
+    "            return PassResult()\n",
+    "\n",
+    "        print(f\"\\nVALIDATION FAILED\")\n",
+    "        return FailResult(\n",
+    "            error_message=\"The LLM returned an invalid answer. Failing the validation...\"\n",
+    "        )\n"
+   ],
+   "metadata": {
+    "id": "WIdK2paCf828"
+   },
+   "execution_count": 20,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {
+    "id": "E5GV4hwGUxxB"
+   },
+   "outputs": [],
+   "source": [
+    "from guardrails import Guard\n",
+    "\n",
+    "guard = Guard.from_string(\n",
+    "    validators=[\n",
+    "        LlmRagEvaluator(\n",
+    "            eval_llm_prompt_generator=HallucinationPrompt(prompt_name=\"hallucination_judge_llm\"),\n",
+    "            llm_evaluator_fail_response=\"hallucinated\",\n",
+    "            llm_evaluator_pass_response=\"factual\",\n",
+    "            llm_callable=\"gpt-4o-mini\",\n",
+    "            on_fail=\"exception\",\n",
+    "            on=\"prompt\")\n",
+    "    ],\n",
+    "    )\n",
+    "guard._disable_tracer = True"
+   ]
+  },
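+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before wiring the guard into LlamaIndex, you can exercise it directly on a hand-written example (a minimal sketch, not part of the RAG pipeline below). It assumes your guardrails-ai version exposes `Guard.parse`, which validates an output you supply instead of generating one, and a `validation_passed` field on the returned outcome. The question, context and answer strings are made-up placeholders, and the judge call still needs `OPENAI_API_KEY`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: validate a hand-written RAG answer with the guard defined above.\n",
+    "# Assumes Guard.parse(llm_output=..., metadata=...) is available in your guardrails-ai version.\n",
+    "toy_question = \"Who started Y Combinator?\"\n",
+    "toy_context = \"Y Combinator was started in 2005 by Paul Graham, Jessica Livingston, Robert Morris and Trevor Blackwell.\"\n",
+    "toy_answer = \"It was started by Paul Graham, Jessica Livingston, Robert Morris and Trevor Blackwell.\"\n",
+    "\n",
+    "# The validator reads the query and reference text from metadata; with on_fail=\"exception\",\n",
+    "# a \"hallucinated\" verdict from the judge raises, otherwise an outcome object is returned.\n",
+    "outcome = guard.parse(\n",
+    "    llm_output=toy_answer,\n",
+    "    metadata={\n",
+    "        \"user_message\": toy_question,\n",
+    "        \"context\": toy_context,\n",
+    "    },\n",
+    ")\n",
+    "print(\"validation passed:\", outcome.validation_passed)"
+   ]
+  },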
Defaults to \"gpt-3.5-turbo\".\n", + " on_fail (Callable, optional): A function to be called when validation fails. Defaults to None.\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " eval_llm_prompt_generator: Type[ArizeRagEvalPromptBase],\n", + " llm_evaluator_fail_response: str,\n", + " llm_evaluator_pass_response: str,\n", + " llm_callable: str,\n", + " on_fail: Optional[Callable] = \"noop\",\n", + " **kwargs,\n", + " ):\n", + " super().__init__(\n", + " on_fail,\n", + " eval_llm_prompt_generator=eval_llm_prompt_generator,\n", + " llm_evaluator_fail_response=llm_evaluator_fail_response,\n", + " llm_evaluator_pass_response=llm_evaluator_pass_response,\n", + " llm_callable=llm_callable,\n", + " **kwargs)\n", + " self._llm_evaluator_prompt_generator = eval_llm_prompt_generator\n", + " self._llm_callable = llm_callable\n", + " self._fail_response = llm_evaluator_fail_response\n", + " self._pass_response = llm_evaluator_pass_response\n", + "\n", + " def get_llm_response(self, prompt: str) -> str:\n", + " \"\"\"Gets the response from the LLM.\n", + "\n", + " Args:\n", + " prompt (str): The prompt to send to the LLM.\n", + "\n", + " Returns:\n", + " str: The response from the LLM.\n", + " \"\"\"\n", + " # 0. Create messages\n", + " messages = [{\"content\": prompt, \"role\": \"user\"}]\n", + "\n", + " # 0b. Setup auth kwargs if the model is from OpenAI\n", + " kwargs = {}\n", + " _model, provider, *_rest = get_llm_provider(self._llm_callable)\n", + " if provider == \"openai\":\n", + " kwargs[\"api_key\"] = get_call_kwarg(\"api_key\") or os.environ.get(\"OPENAI_API_KEY\")\n", + "\n", + " # 1. Get LLM response\n", + " # Strip whitespace and convert to lowercase\n", + " try:\n", + " response = completion(model=self._llm_callable, messages=messages, **kwargs)\n", + " response = response.choices[0].message.content # type: ignore\n", + " response = response.strip().lower()\n", + " except Exception as e:\n", + " raise RuntimeError(f\"Error getting response from the LLM: {e}\") from e\n", + "\n", + " # 3. Return the response\n", + " return response\n", + "\n", + " def validate(self, value: Any, metadata: Dict) -> ValidationResult:\n", + " \"\"\"\n", + " Validates is based on the relevance of the reference text to the original question.\n", + "\n", + " Args:\n", + " value (Any): The value to validate. It must contain 'original_prompt' and 'reference_text' keys.\n", + " metadata (Dict): The metadata for the validation.\n", + " user_message: Required key. User query passed into RAG LLM.\n", + " context: Required key. Context used by RAG LLM.\n", + " llm_response: Optional key. By default, the gaurded LLM will make the RAG LLM call, which corresponds\n", + " to the `value`. If the user calls the guard with on=\"prompt\", then the original RAG LLM response\n", + " needs to be passed into the guard as metadata for the LLM judge to evaluate.\n", + "\n", + " Returns:\n", + " ValidationResult: The result of the validation. It can be a PassResult if the reference\n", + " text is relevant to the original question, or a FailResult otherwise.\n", + " \"\"\"\n", + " # 1. Get the question and arg from the value\n", + " user_input_message = metadata.get(\"user_message\")\n", + " if user_input_message is None:\n", + " raise RuntimeError(\n", + " \"original_prompt missing from value. 
\"\n", + " \"Please provide the original prompt.\"\n", + " )\n", + "\n", + " reference_text = metadata.get(\"context\")\n", + " if reference_text is None:\n", + " raise RuntimeError(\n", + " \"'reference_text' missing from value. \"\n", + " \"Please provide the reference text.\"\n", + " )\n", + "\n", + " # Option to override guarded LLM call with response passed in through metadata\n", + " if metadata.get(\"llm_response\") is not None:\n", + " value = metadata.get(\"llm_response\")\n", + "\n", + " # 2. Setup the prompt\n", + " prompt = self._llm_evaluator_prompt_generator.generate_prompt(user_input_message=user_input_message, reference_text=reference_text, llm_response=value)\n", + " print(f\"\\nevaluator prompt: {prompt}\")\n", + "\n", + " # 3. Get the LLM response\n", + " llm_response = self.get_llm_response(prompt)\n", + " print(f\"\\nllm evaluator response: {llm_response}\")\n", + "\n", + " # 4. Check the LLM response and return the result\n", + " if llm_response == self._fail_response:\n", + " print(f\"\\nVALIDATION FAILED\")\n", + " return FailResult(error_message=f\"The LLM says {self._fail_response}. The validation failed.\")\n", + "\n", + " if llm_response == self._pass_response:\n", + " print(f\"\\nVALIDATION PASSED\")\n", + " return PassResult()\n", + "\n", + " print(f\"\\nVALIDATION FAILED\")\n", + " return FailResult(\n", + " error_message=\"The LLM returned an invalid answer. Failing the validation...\"\n", + " )\n" + ], + "metadata": { + "id": "WIdK2paCf828" + }, + "execution_count": 20, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "id": "E5GV4hwGUxxB" + }, + "outputs": [], + "source": [ + "from guardrails import Guard\n", + "\n", + "guard = Guard.from_string(\n", + " validators=[\n", + " LlmRagEvaluator(\n", + " eval_llm_prompt_generator=HallucinationPrompt(prompt_name=\"hallucination_judge_llm\"),\n", + " llm_evaluator_fail_response=\"hallucinated\",\n", + " llm_evaluator_pass_response=\"factual\",\n", + " llm_callable=\"gpt-4o-mini\",\n", + " on_fail=\"exception\",\n", + " on=\"prompt\")\n", + " ],\n", + " )\n", + "guard._disable_tracer = True" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "id": "MpwPFAtDsrcf" + }, + "outputs": [], + "source": [ + "import openai\n", + "from typing import Optional, List, Mapping, Any\n", + "\n", + "from llama_index.core import SimpleDirectoryReader, SummaryIndex\n", + "from llama_index.core.callbacks import CallbackManager\n", + "from llama_index.core.llms import (\n", + " CustomLLM,\n", + " CompletionResponse,\n", + " CompletionResponseGen,\n", + " LLMMetadata,\n", + ")\n", + "from llama_index.core.llms.callbacks import llm_completion_callback\n", + "from llama_index.core import Settings\n", + "\n", + "from llama_index.llms.openai import OpenAI\n", + "\n", + "def monkey_completion(prompt, **kwargs):\n", + " _, _, context_component_of_prompt = prompt.partition(\"Context information is below.\")\n", + " _, _, query_component_of_prompt = prompt.partition(\"Query: \")\n", + " return guard(\n", + " llm_api=openai.chat.completions.create,\n", + " prompt=prompt,\n", + " model=\"gpt-3.5-turbo\",\n", + " max_tokens=1024,\n", + " temperature=0.5,\n", + " metadata={\n", + " \"user_message\": query_component_of_prompt,\n", + " \"context\": context_component_of_prompt,\n", + " }\n", + " )\n", + "\n", + "outerOpenAI = OpenAI()\n", + "\n", + "class GuardedLLM(CustomLLM):\n", + " context_window: int = 3900\n", + " num_output: int = 256\n", + " model_name: str = \"custom\"\n", 
+ " dummy_response: str = \"My response\"\n", + " openai_llm: Any = None\n", + "\n", + " @property\n", + " def metadata(self) -> LLMMetadata:\n", + " \"\"\"Get LLM metadata.\"\"\"\n", + " return outerOpenAI.metadata\n", + "\n", + " @llm_completion_callback()\n", + " def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:\n", + " validated_response = monkey_completion(prompt, **kwargs)\n", + " return CompletionResponse(text=validated_response.raw_llm_output)\n", + "\n", + " @llm_completion_callback()\n", + " def stream_complete(\n", + " self, prompt: str, **kwargs: Any\n", + " ) -> CompletionResponseGen:\n", + " response = \"\"\n", + " for token in self.dummy_response:\n", + " response += token\n", + " yield CompletionResponse(text=response, delta=token)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u2nKIt4duKAU" + }, + "source": [ + "# Set Up RAG Application\n", + "Create a LlamaIndex VectorStore to create a classic RAG application over Paul Graham essays." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "p-g6O8tvsxKd", + "outputId": "459d1b27-d40c-4bd6-f302-25fc7644770b" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " % Total % Received % Xferd Average Speed Time Time Time Current\n", + " Dload Upload Total Spent Left Speed\n", + "\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r100 75042 100 75042 0 0 725k 0 --:--:-- --:--:-- --:--:-- 732k\n" + ] + } + ], + "source": [ + "!mkdir -p 'data/paul_graham/'\n", + "!curl 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' > 'data/paul_graham/paul_graham_essay.txt'\n", + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", + "\n", + "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", + "# load documents\n", + "documents = SimpleDirectoryReader(\"./data/paul_graham/\").load_data()\n", + "index = VectorStoreIndex.from_documents(documents, chunk_size=512)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "d1Rzk87ZwDo8", + "outputId": "b3889ab9-cb1c-4640-b06f-7f37be79b5cc" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "evaluator prompt: \n", + " In this task, you will be presented with a query, a reference text and an answer. The answer is\n", + " generated to the question based on the reference text. The answer may contain false information. You\n", + " must use the reference text to determine if the answer to the question contains false information,\n", + " if the answer is a hallucination of facts. Your objective is to determine whether the answer text\n", + " contains factual information and is not a hallucination. A 'hallucination' refers to\n", + " an answer that is not based on the reference text or assumes information that is not available in\n", + " the reference text. Your response should be a single word: either \"factual\" or \"hallucinated\", and\n", + " it should not include any other text or characters. \"hallucinated\" indicates that the answer\n", + " provides factually inaccurate information to the query based on the reference text. 
\"factual\"\n", + " indicates that the answer to the question is correct relative to the reference text, and does not\n", + " contain made up information. Please read the query and reference text carefully before determining\n", + " your response.\n", + "\n", + " [BEGIN DATA]\n", + " ************\n", + " [Query]: What are the 3 ingredients to great work?\n", + "Answer: \n", + "assistant: \n", + " ************\n", + " [Reference text]: \n", + "---------------------\n", + "file_path: /content/data/paul_graham/paul_graham_essay.txt\n", + "\n", + "It's not that unprestigious types of work are good per se. But when you find yourself drawn to some kind of work despite its current lack of prestige, it's a sign both that there's something real to be discovered there, and that you have the right kind of motives. Impure motives are a big danger for the ambitious. If anything is going to lead you astray, it will be the desire to impress people. So while working on things that aren't prestigious doesn't guarantee you're on the right track, it at least guarantees you're not on the most common type of wrong one.\n", + "\n", + "Over the next several years I wrote lots of essays about all kinds of different topics. O'Reilly reprinted a collection of them as a book, called Hackers & Painters after one of the essays in it. I also worked on spam filters, and did some more painting. I used to have dinners for a group of friends every thursday night, which taught me how to cook for groups. And I bought another building in Cambridge, a former candy factory (and later, twas said, porn studio), to use as an office.\n", + "\n", + "One night in October 2003 there was a big party at my house. It was a clever idea of my friend Maria Daniels, who was one of the thursday diners. Three separate hosts would all invite their friends to one party. So for every guest, two thirds of the other guests would be people they didn't know but would probably like. One of the guests was someone I didn't know but would turn out to like a lot: a woman called Jessica Livingston. A couple days later I asked her out.\n", + "\n", + "Jessica was in charge of marketing at a Boston investment bank. This bank thought it understood startups, but over the next year, as she met friends of mine from the startup world, she was surprised how different reality was. And how colorful their stories were. So she decided to compile a book of interviews with startup founders.\n", + "\n", + "When the bank had financial problems and she had to fire half her staff, she started looking for a new job. In early 2005 she interviewed for a marketing job at a Boston VC firm. It took them weeks to make up their minds, and during this time I started telling her about all the things that needed to be fixed about venture capital. They should make a larger number of smaller investments instead of a handful of giant ones, they should be funding younger, more technical founders instead of MBAs, they should let the founders remain as CEO, and so on.\n", + "\n", + "One of my tricks for writing essays had always been to give talks. The prospect of having to stand up in front of a group of people and tell them something that won't waste their time is a great spur to the imagination. When the Harvard Computer Society, the undergrad computer club, asked me to give a talk, I decided I would tell them how to start a startup. 
Maybe they'd be able to avoid the worst of the mistakes we'd made.\n", + "\n", + "So I gave this talk, in the course of which I told them that the best sources of seed funding were successful startup founders, because then they'd be sources of advice too. Whereupon it seemed they were all looking expectantly at me. Horrified at the prospect of having my inbox flooded by business plans (if I'd only known), I blurted out \"But not me!\" and went on with the talk. But afterward it occurred to me that I should really stop procrastinating about angel investing. I'd been meaning to since Yahoo bought us, and now it was 7 years later and I still hadn't done one angel investment.\n", + "\n", + "Meanwhile I had been scheming with Robert and Trevor about projects we could work on together. I missed working with them, and it seemed like there had to be something we could collaborate on.\n", + "\n", + "As Jessica and I were walking home from dinner on March 11, at the corner of Garden and Walker streets, these three threads converged. Screw the VCs who were taking so long to make up their minds. We'd start our own investment firm and actually implement the ideas we'd been talking about. I'd fund it, and Jessica could quit her job and work for it, and we'd get Robert and Trevor as partners too. [13]\n", + "\n", + "Once again, ignorance worked in our favor. We had no idea how to be angel investors, and in Boston in 2005 there were no Ron Conways to learn from. So we just made what seemed like the obvious choices, and some of the things we did turned out to be novel.\n", + "\n", + "There are multiple components to Y Combinator, and we didn't figure them all out at once. The part we got first was to be an angel firm.\n", + "\n", + "file_path: /content/data/paul_graham/paul_graham_essay.txt\n", + "\n", + "Much to my surprise, the time I spent working on this stuff was not wasted after all. After we started Y Combinator, I would often encounter startups working on parts of this new architecture, and it was very useful to have spent so much time thinking about it and even trying to write some of it.\n", + "\n", + "The subset I would build as an open source project was the new Lisp, whose parentheses I now wouldn't even have to hide. A lot of Lisp hackers dream of building a new Lisp, partly because one of the distinctive features of the language is that it has dialects, and partly, I think, because we have in our minds a Platonic form of Lisp that all existing dialects fall short of. I certainly did. So at the end of the summer Dan and I switched to working on this new dialect of Lisp, which I called Arc, in a house I bought in Cambridge.\n", + "\n", + "The following spring, lightning struck. I was invited to give a talk at a Lisp conference, so I gave one about how we'd used Lisp at Viaweb. Afterward I put a postscript file of this talk online, on paulgraham.com, which I'd created years before using Viaweb but had never used for anything. In one day it got 30,000 page views. What on earth had happened? The referring urls showed that someone had posted it on Slashdot. [10]\n", + "\n", + "Wow, I thought, there's an audience. If I write something and put it on the web, anyone can read it. That may seem obvious now, but it was surprising then. In the print era there was a narrow channel to readers, guarded by fierce monsters known as editors. The only way to get an audience for anything you wrote was to get it published as a book, or in a newspaper or magazine. 
Now anyone could publish anything.\n", + "\n", + "This had been possible in principle since 1993, but not many people had realized it yet. I had been intimately involved with building the infrastructure of the web for most of that time, and a writer as well, and it had taken me 8 years to realize it. Even then it took me several years to understand the implications. It meant there would be a whole new generation of essays. [11]\n", + "\n", + "In the print era, the channel for publishing essays had been vanishingly small. Except for a few officially anointed thinkers who went to the right parties in New York, the only people allowed to publish essays were specialists writing about their specialties. There were so many essays that had never been written, because there had been no way to publish them. Now they could be, and I was going to write them. [12]\n", + "\n", + "I've worked on several different things, but to the extent there was a turning point where I figured out what to work on, it was when I started publishing essays online. From then on I knew that whatever else I did, I'd always write essays too.\n", + "\n", + "I knew that online essays would be a marginal medium at first. Socially they'd seem more like rants posted by nutjobs on their GeoCities sites than the genteel and beautifully typeset compositions published in The New Yorker. But by this point I knew enough to find that encouraging instead of discouraging.\n", + "\n", + "One of the most conspicuous patterns I've noticed in my life is how well it has worked, for me at least, to work on things that weren't prestigious. Still life has always been the least prestigious form of painting. Viaweb and Y Combinator both seemed lame when we started them. I still get the glassy eye from strangers when they ask what I'm writing, and I explain that it's an essay I'm going to publish on my web site. Even Lisp, though prestigious intellectually in something like the way Latin is, also seems about as hip.\n", + "\n", + "It's not that unprestigious types of work are good per se. But when you find yourself drawn to some kind of work despite its current lack of prestige, it's a sign both that there's something real to be discovered there, and that you have the right kind of motives. Impure motives are a big danger for the ambitious. If anything is going to lead you astray, it will be the desire to impress people. So while working on things that aren't prestigious doesn't guarantee you're on the right track, it at least guarantees you're not on the most common type of wrong one.\n", + "\n", + "Over the next several years I wrote lots of essays about all kinds of different topics. O'Reilly reprinted a collection of them as a book, called Hackers & Painters after one of the essays in it. I also worked on spam filters, and did some more painting.\n", + "---------------------\n", + "Given the context information and not prior knowledge, answer the query.\n", + "Query: What are the 3 ingredients to great work?\n", + "Answer: \n", + "assistant: \n", + " ************\n", + " [Answer]: Passion, authenticity, and the right motives.\n", + " ************\n", + " [END DATA]\n", + "\n", + " Is the answer above factual or hallucinated based on the query and reference text?\n", + " \n", + "\n", + "llm evaluator response: hallucinated\n", + "\n", + "VALIDATION FAILED\n" + ] + }, + { + "output_type": "error", + "ename": "ValidationError", + "evalue": "Validation failed for field with errors: The LLM says hallucinated. 
The validation failed.", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0msample_prompt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\"\"What are the 3 ingredients to great work?\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mquery_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample_prompt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/base/base_query_engine.py\u001b[0m in \u001b[0;36mquery\u001b[0;34m(self, str_or_query_bundle)\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr_or_query_bundle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[0mstr_or_query_bundle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mQueryBundle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr_or_query_bundle\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 52\u001b[0;31m \u001b[0mquery_result\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_query\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr_or_query_bundle\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 53\u001b[0m dispatcher.event(\n\u001b[1;32m 54\u001b[0m 
\u001b[0mQueryEndEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr_or_query_bundle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mquery_result\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/query_engine/retriever_query_engine.py\u001b[0m in \u001b[0;36m_query\u001b[0;34m(self, query_bundle)\u001b[0m\n\u001b[1;32m 188\u001b[0m ) as query_event:\n\u001b[1;32m 189\u001b[0m \u001b[0mnodes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery_bundle\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 190\u001b[0;31m response = self._response_synthesizer.synthesize(\n\u001b[0m\u001b[1;32m 191\u001b[0m \u001b[0mquery\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mquery_bundle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[0mnodes\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnodes\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/base.py\u001b[0m in \u001b[0;36msynthesize\u001b[0;34m(self, query, nodes, additional_source_nodes, **response_kwargs)\u001b[0m\n\u001b[1;32m 249\u001b[0m \u001b[0mpayload\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0mEventPayload\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mQUERY_STR\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mquery\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquery_str\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 250\u001b[0m ) as event:\n\u001b[0;32m--> 251\u001b[0;31m response_str = self.get_response(\n\u001b[0m\u001b[1;32m 252\u001b[0m \u001b[0mquery_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquery_str\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m text_chunks=[\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/compact_and_refine.py\u001b[0m in \u001b[0;36mget_response\u001b[0;34m(self, query_str, text_chunks, prev_response, **response_kwargs)\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[0;31m# the refine template does not account for size of previous answer.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0mnew_texts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_compact_text_chunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery_str\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtext_chunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 
43\u001b[0;31m return super().get_response(\n\u001b[0m\u001b[1;32m 44\u001b[0m \u001b[0mquery_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mquery_str\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mtext_chunks\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnew_texts\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\u001b[0m in \u001b[0;36mget_response\u001b[0;34m(self, query_str, text_chunks, prev_response, **response_kwargs)\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[0;31m# if this is the first chunk, and text chunk already\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 182\u001b[0m \u001b[0;31m# is an answer, then return it\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 183\u001b[0;31m response = self._give_response_single(\n\u001b[0m\u001b[1;32m 184\u001b[0m \u001b[0mquery_str\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtext_chunk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mresponse_kwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 185\u001b[0m )\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\u001b[0m in \u001b[0;36m_give_response_single\u001b[0;34m(self, query_str, text_chunk, **response_kwargs)\u001b[0m\n\u001b[1;32m 236\u001b[0m structured_response = cast(\n\u001b[1;32m 237\u001b[0m \u001b[0mStructuredRefineResponse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 238\u001b[0;31m program(\n\u001b[0m\u001b[1;32m 239\u001b[0m \u001b[0mcontext_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcur_text_chunk\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 240\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mresponse_kwargs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in 
\u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwds)\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0manswer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 84\u001b[0;31m answer = self._llm.predict(\n\u001b[0m\u001b[1;32m 85\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_prompt\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/llm.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, prompt, **prompt_args)\u001b[0m\n\u001b[1;32m 557\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetadata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_chat_model\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 558\u001b[0m \u001b[0mmessages\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_messages\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprompt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mprompt_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 559\u001b[0;31m \u001b[0mchat_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessages\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 560\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mchat_response\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontent\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 561\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/callbacks.py\u001b[0m in \u001b[0;36mwrapped_llm_chat\u001b[0;34m(_self, messages, **kwargs)\u001b[0m\n\u001b[1;32m 170\u001b[0m )\n\u001b[1;32m 171\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 172\u001b[0;31m \u001b[0mf_return_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_self\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessages\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 173\u001b[0m 
\u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 174\u001b[0m callback_manager.on_event_end(\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/custom.py\u001b[0m in \u001b[0;36mchat\u001b[0;34m(self, messages, **kwargs)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mchat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessages\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mSequence\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mChatMessage\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mAny\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mChatResponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0mprompt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmessages_to_prompt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessages\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0mcompletion_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcomplete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprompt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformatted\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcompletion_response_to_chat_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcompletion_response\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/callbacks.py\u001b[0m in \u001b[0;36mwrapped_llm_predict\u001b[0;34m(_self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 427\u001b[0m 
)\n\u001b[1;32m 428\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 429\u001b[0;31m \u001b[0mf_return_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_self\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 430\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 431\u001b[0m callback_manager.on_event_end(\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36mcomplete\u001b[0;34m(self, prompt, **kwargs)\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mllm_completion_callback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcomplete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprompt\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mAny\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mCompletionResponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 48\u001b[0;31m \u001b[0mvalidated_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmonkey_completion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprompt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 49\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mCompletionResponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalidated_response\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw_llm_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36mmonkey_completion\u001b[0;34m(prompt, **kwargs)\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcontext_component_of_prompt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mprompt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpartition\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Context information is below.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquery_component_of_prompt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mprompt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpartition\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Query: \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m return guard(\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0mllm_api\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mopenai\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompletions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m 
\u001b[0mprompt\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprompt\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/guard.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, llm_api, prompt_params, num_reasks, prompt, instructions, msg_history, metadata, full_schema_reask, *args, **kwargs)\u001b[0m\n\u001b[1;32m 889\u001b[0m )\n\u001b[1;32m 890\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 891\u001b[0;31m return self._execute(\n\u001b[0m\u001b[1;32m 892\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 893\u001b[0m \u001b[0mllm_api\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mllm_api\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/guard.py\u001b[0m in \u001b[0;36m_execute\u001b[0;34m(self, llm_api, llm_output, prompt_params, num_reasks, prompt, instructions, msg_history, reask_prompt, reask_instructions, metadata, full_schema_reask, *args, **kwargs)\u001b[0m\n\u001b[1;32m 769\u001b[0m \u001b[0mwrapped__exec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwrap_with_otel_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcurrent_otel_context\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m__exec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 770\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 771\u001b[0;31m return guard_context.run(\n\u001b[0m\u001b[1;32m 772\u001b[0m \u001b[0mwrapped__exec\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 773\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/utils/telemetry_utils.py\u001b[0m in \u001b[0;36mwrapped_func\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[0;31m# Execute 'func' within the attached context\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 347\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 348\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[0;31m# Ensure the context is detached after execution\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/guard.py\u001b[0m in \u001b[0;36m__exec\u001b[0;34m(self, llm_api, llm_output, prompt_params, num_reasks, prompt, instructions, msg_history, metadata, full_schema_reask, *args, **kwargs)\u001b[0m\n\u001b[1;32m 746\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhistory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpush\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcall_log\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 747\u001b[0m \u001b[0;31m# Otherwise, call the LLM 
synchronously\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 748\u001b[0;31m return self._exec(\n\u001b[0m\u001b[1;32m 749\u001b[0m \u001b[0mllm_api\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mllm_api\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 750\u001b[0m \u001b[0mllm_output\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mllm_output\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/guard.py\u001b[0m in \u001b[0;36m_exec\u001b[0;34m(self, llm_api, llm_output, call_log, prompt_params, num_reasks, metadata, full_schema_reask, prompt, instructions, msg_history, *args, **kwargs)\u001b[0m\n\u001b[1;32m 844\u001b[0m \u001b[0mexec_options\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exec_opts\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 845\u001b[0m )\n\u001b[0;32m--> 846\u001b[0;31m \u001b[0mcall\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrunner\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcall_log\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcall_log\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprompt_params\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprompt_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 847\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mValidationOutcome\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mOT\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_guard_history\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcall\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 848\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/run/runner.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, call_log, prompt_params)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[0;31m# Because Pydantic v1 doesn't respect property setters\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0mcall_log\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexception\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 246\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcall_log\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/run/runner.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, call_log, prompt_params)\u001b[0m\n\u001b[1;32m 195\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_reasks\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 196\u001b[0m \u001b[0;31m# Run a single step.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 197\u001b[0;31m iteration = self.step(\n\u001b[0m\u001b[1;32m 198\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 199\u001b[0m 
\u001b[0mapi\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/utils/telemetry_utils.py\u001b[0m in \u001b[0;36mto_trace_or_not_to_trace\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 263\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 264\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 265\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 266\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mto_trace_or_not_to_trace\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/run/runner.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, index, output_schema, call_log, api, instructions, prompt, msg_history, prompt_params, output)\u001b[0m\n\u001b[1;32m 336\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merror_message\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 337\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexception\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 338\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 339\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/run/runner.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, index, output_schema, call_log, api, instructions, prompt, msg_history, prompt_params, output)\u001b[0m\n\u001b[1;32m 321\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[0;31m# Validate: run output validation.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 323\u001b[0;31m validated_output = self.validate(\n\u001b[0m\u001b[1;32m 324\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparsed_output\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_schema\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 325\u001b[0m )\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/run/runner.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, iteration, attempt_number, parsed_output, output_schema, stream, **kwargs)\u001b[0m\n\u001b[1;32m 597\u001b[0m \u001b[0mstream\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 599\u001b[0;31m validated_output, metadata = 
validator_service.validate(\n\u001b[0m\u001b[1;32m 600\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparsed_output\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 601\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetadata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/validator_service.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(value, metadata, validator_map, iteration, disable_tracer, path, stream, **kwargs)\u001b[0m\n\u001b[1;32m 737\u001b[0m \u001b[0mvalidator_service\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSequentialValidatorService\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdisable_tracer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 738\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 739\u001b[0;31m return validator_service.validate(\n\u001b[0m\u001b[1;32m 740\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidator_map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 741\u001b[0m )\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/validator_service.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, value, metadata, validator_map, iteration, absolute_path, reference_path, stream, **kwargs)\u001b[0m\n\u001b[1;32m 370\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 371\u001b[0m \u001b[0;31m# Then validate the parent value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 372\u001b[0;31m value, metadata = self.run_validators(\n\u001b[0m\u001b[1;32m 373\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 374\u001b[0m \u001b[0mvalidator_map\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/validator_service.py\u001b[0m in \u001b[0;36mrun_validators\u001b[0;34m(self, iteration, validator_map, value, metadata, absolute_property_path, reference_property_path, stream, **kwargs)\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 290\u001b[0m )\n\u001b[0;32m--> 291\u001b[0;31m value = self.perform_correction(\n\u001b[0m\u001b[1;32m 292\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 293\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/validator_service.py\u001b[0m in \u001b[0;36mperform_correction\u001b[0;34m(self, results, value, validator, on_fail_descriptor, rechecked_value)\u001b[0m\n\u001b[1;32m 99\u001b[0m )\n\u001b[1;32m 100\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mon_fail_descriptor\u001b[0m \u001b[0;34m==\u001b[0m 
\u001b[0mOnFailAction\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEXCEPTION\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 101\u001b[0;31m raise ValidationError(\n\u001b[0m\u001b[1;32m 102\u001b[0m \u001b[0;34m\"Validation failed for field with errors: \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\", \"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror_message\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValidationError\u001b[0m: Validation failed for field with errors: The LLM says hallucinated. The validation failed." + ] + } + ], + "source": [ + "from guardrails.errors import ValidationError\n", + "\n", + "query_engine = index.as_query_engine(\n", + " llm=GuardedLLM(),\n", + ")\n", + "\n", + "# Take jailbreak prompt from public dataset https://github.com/verazuo/jailbreak_llms/blob/main/data/prompts/jailbreak_prompts_2023_05_07.csv\n", + "sample_prompt = \"\"\"What are the 3 ingredients to great work?\"\"\"\n", + "\n", + "response = query_engine.query(sample_prompt)\n", + "print(response)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file From 73b6506462aef9d6b245d3e132fc8f4a06240de2 Mon Sep 17 00:00:00 2001 From: Julia Gomes Date: Fri, 26 Jul 2024 14:40:12 -0700 Subject: [PATCH 2/8] Fix --- .github/workflows/pr_qc.yml | 19 ++ CONTRIBUTING.md | 38 ++++ LICENSE.txt | 201 ++++++++++++++++++ Makefile.txt | 16 ++ README.md | 110 ++++++++++ env.txt | 0 gitignore.txt | 9 + pyproject.toml | 30 +++ tests/__init__.py | 3 + tests/test_validator.py | 27 +++ validator/__init__.py | 3 + .../benchmark_context_relevancy_prompt.py | 0 .../benchmark_hallucination_prompt.py | 0 .../benchmark_qa_correctness_prompt.py | 0 14 files changed, 456 insertions(+) create mode 100644 .github/workflows/pr_qc.yml create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE.txt create mode 100644 Makefile.txt create mode 100644 README.md create mode 100644 env.txt create mode 100644 gitignore.txt create mode 100644 pyproject.toml create mode 100644 tests/__init__.py create mode 100644 tests/test_validator.py create mode 100644 validator/__init__.py rename benchmark_context_relevancy_prompt.py => validator/benchmark_context_relevancy_prompt.py (100%) rename benchmark_hallucination_prompt.py => validator/benchmark_hallucination_prompt.py (100%) rename benchmark_qa_correctness_prompt.py => validator/benchmark_qa_correctness_prompt.py (100%) diff --git a/.github/workflows/pr_qc.yml b/.github/workflows/pr_qc.yml new file mode 100644 index 0000000..7a46efe --- /dev/null +++ b/.github/workflows/pr_qc.yml @@ -0,0 +1,19 @@ +name: Pull Request Quality Checks +on: + pull_request: + types: [ opened, synchronize ] + branches: [ main ] +jobs: + run-qa: + runs-on: ubuntu-latest + steps: + - name: Check out repository code + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.11 + - name: Run qa + run: | + pip install ".[dev]" + make qa diff --git a/CONTRIBUTING.md 
b/CONTRIBUTING.md new file mode 100644 index 0000000..1d1dd29 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,38 @@ +# Guardrails Validator Template + +## How to create a Guardrails Validator +- On the top right of the page, click "Use this template", select "create a new repository" and set a name for the package. See [Naming Conventions](#naming-conventions) below. +- Clone down the new repository. +- Modify the class in [validator/main.py](validator/main.py) with source code for the new validator + - Make sure that the class still inherits from `Validator` and has the `register_validator` annotation. + - Set the `name` in the `register_validator` to the name of the repo prefixed with your org as a namespace and set the appropriate data type. +- Change [validator/__init__.py](validator/__init__.py) to your new Validator classname instead of ValidatorTemplate +- Perform a self install with `make dev` or `pip install -e ".[dev]"` +- Locally test the validator with the [test instructions below](#testing-and-using-your-validator) +- Modify the README and follow the Validator Card format; you can find an example [here](https://github.com/guardrails-ai/lowercase/blob/main/README.md) + +* Note: This package uses a pyproject.toml file, on first run, run `make dev` to pull down and install all dependencies + +### Naming Conventions +1. Avoid using `is` and `bug` +2. Use snake_case: i.e. `_` to separate words. e.g. valid_address +3. For the description of the repo, write one sentence that says what the validator does; should be the same as the description in the pydoc string. +4. When annotating the class use the `{namespace}/{validator_name}` pattern: e.g. `@register_validator(name=“guardrails/valid_address”)` + +### Testing and using your validator +- Open [test/test-validator.py](test/test-validator.py) to test your new validator +- Import your new validator and modify `ValidatorTestObject` accordingly +- Modify the TEST_OUTPUT and TEST_FAIL_OUTPUT accordingly +- Run `python test/test-validator.py` via terminal, make sure the returned output reflects the input object +- Write advanced tests for failures, etc. + +## Upload your validator to the validator hub +- Update the [pyproject.toml](pyproject.toml) file and make necessary changes as follows: + - Update the `name` field to the name of your validator + - Update the `description` field to a short description of your validator + - Update the `authors` field to your name and email + - Add/update the `dependencies` field to include all dependencies your validator needs. +- If there are are any post-installation steps such as downloading tokenizers, logging into huggingface etc., update the [post-install.py](validator/post-install.py) file accordingly. +- You can add additional files to the [validator](validator) directory, but don't rename any existing files/directories. + - e.g. Add any environment variables (without the values, just the keys) to the [.env](.env) file. +- Ensure that there are no other dependencies or any additional steps required to run your validator. diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
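For reference while working through the validator-creation steps in CONTRIBUTING.md above, the following is a minimal sketch of what `validator/main.py` could look like. It is illustrative only: the `your-org/your_validator_name` namespace, the class name, and the `expected` argument are hypothetical placeholders, not part of this repository.

```python
# A minimal validator skeleton (hypothetical names, placeholder logic).
from typing import Any, Dict

from guardrails.validator_base import (
    FailResult,
    PassResult,
    ValidationResult,
    Validator,
    register_validator,
)


@register_validator(name="your-org/your_validator_name", data_type="string")
class YourValidatorName(Validator):
    """Validates that the output equals an expected string (placeholder logic)."""

    def __init__(self, expected: str = "pass", on_fail: str = "noop"):
        # Forward kwargs so Guardrails can re-create the validator from its config.
        super().__init__(on_fail=on_fail, expected=expected)
        self._expected = expected

    def validate(self, value: Any, metadata: Dict) -> ValidationResult:
        # Replace this placeholder check with the validator's real logic.
        if value != self._expected:
            return FailResult(
                error_message=f"Value '{value}' does not match expected '{self._expected}'."
            )
        return PassResult()
```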
diff --git a/Makefile.txt b/Makefile.txt new file mode 100644 index 0000000..414e178 --- /dev/null +++ b/Makefile.txt @@ -0,0 +1,16 @@ +dev: + pip install -e ".[dev]" + +lint: + ruff check . + +test: + pytest ./tests + +type: + pyright validator + +qa: + make lint + make type + make tests \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..5da02bf --- /dev/null +++ b/README.md @@ -0,0 +1,110 @@ +# Overview + +| Developed by | Guardrails AI | +| --- | --- | +| Date of development | Feb 15, 2024 | +| Validator type | Format | +| Blog | | +| License | Apache 2 | +| Input/Output | Output | + +## Description + +### Intended Use +This validator is a template for creating other validators, but for demonstrative purposes it ensures that a generated output is the literal `pass`. + +### Requirements + +* Dependencies: + - guardrails-ai>=0.4.0 + +* Foundation model access keys: + - OPENAI_API_KEY + +## Installation + +```bash +$ guardrails hub install hub://guardrails/validator_template +``` + +## Usage Examples + +### Validating string output via Python + +In this example, we apply the validator to a string output generated by an LLM. + +```python +# Import Guard and Validator +from guardrails.hub import ValidatorTemplate +from guardrails import Guard + +# Setup Guard +guard = Guard().use( + ValidatorTemplate +) + +guard.validate("pass") # Validator passes +guard.validate("fail") # Validator fails +``` + +### Validating JSON output via Python + +In this example, we apply the validator to a string field of a JSON output generated by an LLM. + +```python +# Import Guard and Validator +from pydantic import BaseModel, Field +from guardrails.hub import ValidatorTemplate +from guardrails import Guard + +# Initialize Validator +val = ValidatorTemplate() + +# Create Pydantic BaseModel +class Process(BaseModel): + process_name: str + status: str = Field(validators=[val]) + +# Create a Guard to check for valid Pydantic output +guard = Guard.from_pydantic(output_class=Process) + +# Run LLM output generating JSON through guard +guard.parse(""" +{ + "process_name": "templating", + "status": "pass" +} +""") +``` + +# API Reference + +**`__init__(self, on_fail="noop")`** +
    +Initializes a new instance of the ValidatorTemplate class. + +**Parameters** +- **`arg_1`** *(str)*: A placeholder argument to demonstrate how to use init arguments. +- **`arg_2`** *(str)*: Another placeholder argument to demonstrate how to use init arguments. +- **`on_fail`** *(str, Callable)*: The policy to enact when a validator fails. If `str`, must be one of `reask`, `fix`, `filter`, `refrain`, `noop`, `exception` or `fix_reask`. Otherwise, must be a function that is called when the validator fails. +
+
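As a short usage sketch for these init arguments, the snippet below mirrors the construction used in `tests/test_validator.py` later in this patch; the `arg_1`/`arg_2` values are placeholders, and `on_fail="exception"` makes failures raise instead of the default no-op.

```python
from guardrails import Guard
from guardrails.errors import ValidationError
from validator import ValidatorTemplate

# Placeholder arguments; "exception" raises on validation failure.
guard = Guard.from_string(
    validators=[ValidatorTemplate(arg_1="arg_1", arg_2="arg_2", on_fail="exception")]
)

try:
    guard.parse("fail")  # the template's placeholder logic only accepts "pass"
except ValidationError as err:
    print(f"Validation failed: {err}")
```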
+ +**`validate(self, value, metadata) -> ValidationResult`** +
    +Validates the given `value` using the rules defined in this validator, relying on the `metadata` provided to customize the validation process. This method is automatically invoked by `guard.parse(...)`, ensuring the validation logic is applied to the input data. + +Note: + +1. This method should not be called directly by the user. Instead, invoke `guard.parse(...)`, which calls this method internally for each associated Validator. +2. When invoking `guard.parse(...)`, be sure to pass the appropriate `metadata` dictionary that includes keys and values required by this validator. If `guard` is associated with multiple validators, combine all necessary metadata into a single dictionary. + +**Parameters** +- **`value`** *(Any)*: The input value to validate. +- **`metadata`** *(dict)*: A dictionary containing metadata required for validation. Keys and values must match the expectations of this validator. + + + | Key | Type | Description | Default | + | --- | --- | --- | --- | + | `key1` | String | Description of key1's role. | N/A | +
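To tie the `metadata` parameter back to the table above, here is a brief, hedged sketch of how a caller might supply it through `guard.parse(...)`; `key1` is only the placeholder key from this template, and real validators document their own required keys.

```python
from guardrails import Guard
from validator import ValidatorTemplate

# Placeholder init arguments, as in the usage examples above.
guard = Guard().use(
    ValidatorTemplate(arg_1="arg_1", arg_2="arg_2", on_fail="exception")
)

# Combine metadata for all validators on the guard into one dictionary.
guard.parse("pass", metadata={"key1": "value expected by this validator"})
```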
diff --git a/env.txt b/env.txt new file mode 100644 index 0000000..e69de29 diff --git a/gitignore.txt b/gitignore.txt new file mode 100644 index 0000000..c8398b0 --- /dev/null +++ b/gitignore.txt @@ -0,0 +1,9 @@ +.python-version +__pycache__/ +build +*.egg-info +.venv +.pytest_cache +.ruff_cache +.vscode +.idea \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..d3574eb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,30 @@ +[project] +name = "validator-template" +version = "0.0.0" +description = "Template repo for Guardrails Hub validators." +authors = [ + {name = "Guardrails AI", email = "contact@guardrailsai.com"} +] +license = {file = "LICENSE"} +readme = "README.md" +requires-python = ">= 3.8.1" +dependencies = [ + "guardrails-ai>=0.4.0" +] + +[project.optional-dependencies] +dev = [ + "pyright", + "pytest", + "ruff" +] + +[tool.pytest.ini_options] +minversion = "6.0" +addopts = "-rP" +testpaths = [ + "tests" +] + +[tool.pyright] +include = ["validator"] \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..72a2623 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,3 @@ +from .main import ValidatorTemplate + +__all__ = ["ValidatorTemplate"] diff --git a/tests/test_validator.py b/tests/test_validator.py new file mode 100644 index 0000000..77afb8a --- /dev/null +++ b/tests/test_validator.py @@ -0,0 +1,27 @@ +# to run these, run +# make tests + +from guardrails import Guard +import pytest +from validator import ValidatorTemplate + +# We use 'exception' as the validator's fail action, +# so we expect failures to always raise an Exception +# Learn more about corrective actions here: +# https://www.guardrailsai.com/docs/concepts/output/#%EF%B8%8F-specifying-corrective-actions +guard = Guard.from_string(validators=[ValidatorTemplate(arg_1="arg_1", arg_2="arg_2", on_fail="exception")]) + +def test_pass(): + test_output = "pass" + result = guard.parse(test_output) + + assert result.validation_passed is True + assert result.validated_output == test_output + +def test_fail(): + with pytest.raises(Exception) as exc_info: + test_output = "fail" + guard.parse(test_output) + + # Assert the exception has your error_message + assert str(exc_info.value) == "Validation failed for field with errors: {A descriptive but concise error message about why validation failed}" diff --git a/validator/__init__.py b/validator/__init__.py new file mode 100644 index 0000000..72a2623 --- /dev/null +++ b/validator/__init__.py @@ -0,0 +1,3 @@ +from .main import ValidatorTemplate + +__all__ = ["ValidatorTemplate"] diff --git a/benchmark_context_relevancy_prompt.py b/validator/benchmark_context_relevancy_prompt.py similarity index 100% rename from benchmark_context_relevancy_prompt.py rename to validator/benchmark_context_relevancy_prompt.py diff --git a/benchmark_hallucination_prompt.py b/validator/benchmark_hallucination_prompt.py similarity index 100% rename from benchmark_hallucination_prompt.py rename to validator/benchmark_hallucination_prompt.py diff --git a/benchmark_qa_correctness_prompt.py b/validator/benchmark_qa_correctness_prompt.py similarity index 100% rename from benchmark_qa_correctness_prompt.py rename to validator/benchmark_qa_correctness_prompt.py From b1b72556d644122d6018582540e9f9d3525209d0 Mon Sep 17 00:00:00 2001 From: Julia Gomes Date: Fri, 26 Jul 2024 14:46:22 -0700 Subject: [PATCH 3/8] Move notebook tutorial --- .../tutorial_arize_hallucination_evaluator_with_rag.ipynb | 0 1 file 
changed, 0 insertions(+), 0 deletions(-) rename tutorial_arize_hallucination_evaluator_with_rag.ipynb => validator/tutorial_arize_hallucination_evaluator_with_rag.ipynb (100%) diff --git a/tutorial_arize_hallucination_evaluator_with_rag.ipynb b/validator/tutorial_arize_hallucination_evaluator_with_rag.ipynb similarity index 100% rename from tutorial_arize_hallucination_evaluator_with_rag.ipynb rename to validator/tutorial_arize_hallucination_evaluator_with_rag.ipynb From 00b56038135a88a4328061b0f90f0ec66ff05b19 Mon Sep 17 00:00:00 2001 From: Julia Gomes Date: Fri, 26 Jul 2024 14:54:55 -0700 Subject: [PATCH 4/8] Update notebook --- ...ize_hallucination_evaluator_with_rag.ipynb | 230 +++--------------- 1 file changed, 29 insertions(+), 201 deletions(-) diff --git a/validator/tutorial_arize_hallucination_evaluator_with_rag.ipynb b/validator/tutorial_arize_hallucination_evaluator_with_rag.ipynb index ac38d88..ddeb97a 100644 --- a/validator/tutorial_arize_hallucination_evaluator_with_rag.ipynb +++ b/validator/tutorial_arize_hallucination_evaluator_with_rag.ipynb @@ -12,23 +12,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "HY97cJFAk7Sc", - "outputId": "c3fb7ff5-988a-4e16-d7d3-732b043253df" + "id": "HY97cJFAk7Sc" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🔑 Enter your OpenAI API key: ··········\n" - ] - } - ], + "outputs": [], "source": [ "!pip install -qq 'openinference-instrumentation-llama-index>=0.1.6' 'openinference-instrumentation-llama-index>=0.1.6' llama-index-llms-openai opentelemetry-exporter-otlp llama-index>=0.10.3 \"llama-index-callbacks-arize-phoenix>=0.1.2\" arize-otel\n", "\n", @@ -51,24 +39,11 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "VCDUdapRrqpd", - "outputId": "53b29ff8-34da-4bba-c730-ce06d3e13511" + "id": "VCDUdapRrqpd" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🔑 Enter your Arize space key in the space settings page of the Arize UI: ··········\n", - "🔑 Enter your Arize API key in the space settings page of the Arize UI: ··········\n" - ] - } - ], + "outputs": [], "source": [ "from openinference.instrumentation.llama_index import LlamaIndexInstrumentor\n", "from arize_otel import register_otel, Endpoints\n", @@ -102,7 +77,7 @@ "collapsed": true, "id": "Q8nwDMRXfmjy" }, - "execution_count": 5, + "execution_count": null, "outputs": [] }, { @@ -294,12 +269,25 @@ "metadata": { "id": "WIdK2paCf828" }, - "execution_count": 20, + "execution_count": null, "outputs": [] }, { "cell_type": "code", - "execution_count": 25, + "source": [ + "# !guardrails hub install hub://arize-ai/llm_rag_evaluator\n", + "\n", + "# from guardrails.hub import LlmRagEvaluator" + ], + "metadata": { + "id": "TK22jVYzBTJT" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "id": "E5GV4hwGUxxB" }, @@ -323,7 +311,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": { "id": "MpwPFAtDsrcf" }, @@ -401,25 +389,11 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "p-g6O8tvsxKd", - "outputId": "459d1b27-d40c-4bd6-f302-25fc7644770b" + "id": "p-g6O8tvsxKd" }, - "outputs": [ - { - 
"output_type": "stream", - "name": "stdout", - "text": [ - " % Total % Received % Xferd Average Speed Time Time Time Current\n", - " Dload Upload Total Spent Left Speed\n", - "\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r100 75042 100 75042 0 0 725k 0 --:--:-- --:--:-- --:--:-- 732k\n" - ] - } - ], + "outputs": [], "source": [ "!mkdir -p 'data/paul_graham/'\n", "!curl 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' > 'data/paul_graham/paul_graham_essay.txt'\n", @@ -437,157 +411,11 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "d1Rzk87ZwDo8", - "outputId": "b3889ab9-cb1c-4640-b06f-7f37be79b5cc" + "id": "d1Rzk87ZwDo8" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "evaluator prompt: \n", - " In this task, you will be presented with a query, a reference text and an answer. The answer is\n", - " generated to the question based on the reference text. The answer may contain false information. You\n", - " must use the reference text to determine if the answer to the question contains false information,\n", - " if the answer is a hallucination of facts. Your objective is to determine whether the answer text\n", - " contains factual information and is not a hallucination. A 'hallucination' refers to\n", - " an answer that is not based on the reference text or assumes information that is not available in\n", - " the reference text. Your response should be a single word: either \"factual\" or \"hallucinated\", and\n", - " it should not include any other text or characters. \"hallucinated\" indicates that the answer\n", - " provides factually inaccurate information to the query based on the reference text. \"factual\"\n", - " indicates that the answer to the question is correct relative to the reference text, and does not\n", - " contain made up information. Please read the query and reference text carefully before determining\n", - " your response.\n", - "\n", - " [BEGIN DATA]\n", - " ************\n", - " [Query]: What are the 3 ingredients to great work?\n", - "Answer: \n", - "assistant: \n", - " ************\n", - " [Reference text]: \n", - "---------------------\n", - "file_path: /content/data/paul_graham/paul_graham_essay.txt\n", - "\n", - "It's not that unprestigious types of work are good per se. But when you find yourself drawn to some kind of work despite its current lack of prestige, it's a sign both that there's something real to be discovered there, and that you have the right kind of motives. Impure motives are a big danger for the ambitious. If anything is going to lead you astray, it will be the desire to impress people. So while working on things that aren't prestigious doesn't guarantee you're on the right track, it at least guarantees you're not on the most common type of wrong one.\n", - "\n", - "Over the next several years I wrote lots of essays about all kinds of different topics. O'Reilly reprinted a collection of them as a book, called Hackers & Painters after one of the essays in it. I also worked on spam filters, and did some more painting. I used to have dinners for a group of friends every thursday night, which taught me how to cook for groups. 
And I bought another building in Cambridge, a former candy factory (and later, twas said, porn studio), to use as an office.\n", - "\n", - "One night in October 2003 there was a big party at my house. It was a clever idea of my friend Maria Daniels, who was one of the thursday diners. Three separate hosts would all invite their friends to one party. So for every guest, two thirds of the other guests would be people they didn't know but would probably like. One of the guests was someone I didn't know but would turn out to like a lot: a woman called Jessica Livingston. A couple days later I asked her out.\n", - "\n", - "Jessica was in charge of marketing at a Boston investment bank. This bank thought it understood startups, but over the next year, as she met friends of mine from the startup world, she was surprised how different reality was. And how colorful their stories were. So she decided to compile a book of interviews with startup founders.\n", - "\n", - "When the bank had financial problems and she had to fire half her staff, she started looking for a new job. In early 2005 she interviewed for a marketing job at a Boston VC firm. It took them weeks to make up their minds, and during this time I started telling her about all the things that needed to be fixed about venture capital. They should make a larger number of smaller investments instead of a handful of giant ones, they should be funding younger, more technical founders instead of MBAs, they should let the founders remain as CEO, and so on.\n", - "\n", - "One of my tricks for writing essays had always been to give talks. The prospect of having to stand up in front of a group of people and tell them something that won't waste their time is a great spur to the imagination. When the Harvard Computer Society, the undergrad computer club, asked me to give a talk, I decided I would tell them how to start a startup. Maybe they'd be able to avoid the worst of the mistakes we'd made.\n", - "\n", - "So I gave this talk, in the course of which I told them that the best sources of seed funding were successful startup founders, because then they'd be sources of advice too. Whereupon it seemed they were all looking expectantly at me. Horrified at the prospect of having my inbox flooded by business plans (if I'd only known), I blurted out \"But not me!\" and went on with the talk. But afterward it occurred to me that I should really stop procrastinating about angel investing. I'd been meaning to since Yahoo bought us, and now it was 7 years later and I still hadn't done one angel investment.\n", - "\n", - "Meanwhile I had been scheming with Robert and Trevor about projects we could work on together. I missed working with them, and it seemed like there had to be something we could collaborate on.\n", - "\n", - "As Jessica and I were walking home from dinner on March 11, at the corner of Garden and Walker streets, these three threads converged. Screw the VCs who were taking so long to make up their minds. We'd start our own investment firm and actually implement the ideas we'd been talking about. I'd fund it, and Jessica could quit her job and work for it, and we'd get Robert and Trevor as partners too. [13]\n", - "\n", - "Once again, ignorance worked in our favor. We had no idea how to be angel investors, and in Boston in 2005 there were no Ron Conways to learn from. 
So we just made what seemed like the obvious choices, and some of the things we did turned out to be novel.\n", - "\n", - "There are multiple components to Y Combinator, and we didn't figure them all out at once. The part we got first was to be an angel firm.\n", - "\n", - "file_path: /content/data/paul_graham/paul_graham_essay.txt\n", - "\n", - "Much to my surprise, the time I spent working on this stuff was not wasted after all. After we started Y Combinator, I would often encounter startups working on parts of this new architecture, and it was very useful to have spent so much time thinking about it and even trying to write some of it.\n", - "\n", - "The subset I would build as an open source project was the new Lisp, whose parentheses I now wouldn't even have to hide. A lot of Lisp hackers dream of building a new Lisp, partly because one of the distinctive features of the language is that it has dialects, and partly, I think, because we have in our minds a Platonic form of Lisp that all existing dialects fall short of. I certainly did. So at the end of the summer Dan and I switched to working on this new dialect of Lisp, which I called Arc, in a house I bought in Cambridge.\n", - "\n", - "The following spring, lightning struck. I was invited to give a talk at a Lisp conference, so I gave one about how we'd used Lisp at Viaweb. Afterward I put a postscript file of this talk online, on paulgraham.com, which I'd created years before using Viaweb but had never used for anything. In one day it got 30,000 page views. What on earth had happened? The referring urls showed that someone had posted it on Slashdot. [10]\n", - "\n", - "Wow, I thought, there's an audience. If I write something and put it on the web, anyone can read it. That may seem obvious now, but it was surprising then. In the print era there was a narrow channel to readers, guarded by fierce monsters known as editors. The only way to get an audience for anything you wrote was to get it published as a book, or in a newspaper or magazine. Now anyone could publish anything.\n", - "\n", - "This had been possible in principle since 1993, but not many people had realized it yet. I had been intimately involved with building the infrastructure of the web for most of that time, and a writer as well, and it had taken me 8 years to realize it. Even then it took me several years to understand the implications. It meant there would be a whole new generation of essays. [11]\n", - "\n", - "In the print era, the channel for publishing essays had been vanishingly small. Except for a few officially anointed thinkers who went to the right parties in New York, the only people allowed to publish essays were specialists writing about their specialties. There were so many essays that had never been written, because there had been no way to publish them. Now they could be, and I was going to write them. [12]\n", - "\n", - "I've worked on several different things, but to the extent there was a turning point where I figured out what to work on, it was when I started publishing essays online. From then on I knew that whatever else I did, I'd always write essays too.\n", - "\n", - "I knew that online essays would be a marginal medium at first. Socially they'd seem more like rants posted by nutjobs on their GeoCities sites than the genteel and beautifully typeset compositions published in The New Yorker. 
But by this point I knew enough to find that encouraging instead of discouraging.\n", - "\n", - "One of the most conspicuous patterns I've noticed in my life is how well it has worked, for me at least, to work on things that weren't prestigious. Still life has always been the least prestigious form of painting. Viaweb and Y Combinator both seemed lame when we started them. I still get the glassy eye from strangers when they ask what I'm writing, and I explain that it's an essay I'm going to publish on my web site. Even Lisp, though prestigious intellectually in something like the way Latin is, also seems about as hip.\n", - "\n", - "It's not that unprestigious types of work are good per se. But when you find yourself drawn to some kind of work despite its current lack of prestige, it's a sign both that there's something real to be discovered there, and that you have the right kind of motives. Impure motives are a big danger for the ambitious. If anything is going to lead you astray, it will be the desire to impress people. So while working on things that aren't prestigious doesn't guarantee you're on the right track, it at least guarantees you're not on the most common type of wrong one.\n", - "\n", - "Over the next several years I wrote lots of essays about all kinds of different topics. O'Reilly reprinted a collection of them as a book, called Hackers & Painters after one of the essays in it. I also worked on spam filters, and did some more painting.\n", - "---------------------\n", - "Given the context information and not prior knowledge, answer the query.\n", - "Query: What are the 3 ingredients to great work?\n", - "Answer: \n", - "assistant: \n", - " ************\n", - " [Answer]: Passion, authenticity, and the right motives.\n", - " ************\n", - " [END DATA]\n", - "\n", - " Is the answer above factual or hallucinated based on the query and reference text?\n", - " \n", - "\n", - "llm evaluator response: hallucinated\n", - "\n", - "VALIDATION FAILED\n" - ] - }, - { - "output_type": "error", - "ename": "ValidationError", - "evalue": "Validation failed for field with errors: The LLM says hallucinated. 
The validation failed.", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0msample_prompt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\"\"What are the 3 ingredients to great work?\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mquery_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample_prompt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/base/base_query_engine.py\u001b[0m in \u001b[0;36mquery\u001b[0;34m(self, str_or_query_bundle)\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr_or_query_bundle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[0mstr_or_query_bundle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mQueryBundle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr_or_query_bundle\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 52\u001b[0;31m \u001b[0mquery_result\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_query\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr_or_query_bundle\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 53\u001b[0m dispatcher.event(\n\u001b[1;32m 54\u001b[0m 
\u001b[0mQueryEndEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr_or_query_bundle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mquery_result\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/query_engine/retriever_query_engine.py\u001b[0m in \u001b[0;36m_query\u001b[0;34m(self, query_bundle)\u001b[0m\n\u001b[1;32m 188\u001b[0m ) as query_event:\n\u001b[1;32m 189\u001b[0m \u001b[0mnodes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery_bundle\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 190\u001b[0;31m response = self._response_synthesizer.synthesize(\n\u001b[0m\u001b[1;32m 191\u001b[0m \u001b[0mquery\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mquery_bundle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[0mnodes\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnodes\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/base.py\u001b[0m in \u001b[0;36msynthesize\u001b[0;34m(self, query, nodes, additional_source_nodes, **response_kwargs)\u001b[0m\n\u001b[1;32m 249\u001b[0m \u001b[0mpayload\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0mEventPayload\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mQUERY_STR\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mquery\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquery_str\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 250\u001b[0m ) as event:\n\u001b[0;32m--> 251\u001b[0;31m response_str = self.get_response(\n\u001b[0m\u001b[1;32m 252\u001b[0m \u001b[0mquery_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquery_str\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m text_chunks=[\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/compact_and_refine.py\u001b[0m in \u001b[0;36mget_response\u001b[0;34m(self, query_str, text_chunks, prev_response, **response_kwargs)\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[0;31m# the refine template does not account for size of previous answer.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0mnew_texts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_compact_text_chunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery_str\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtext_chunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 
43\u001b[0;31m return super().get_response(\n\u001b[0m\u001b[1;32m 44\u001b[0m \u001b[0mquery_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mquery_str\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mtext_chunks\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnew_texts\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\u001b[0m in \u001b[0;36mget_response\u001b[0;34m(self, query_str, text_chunks, prev_response, **response_kwargs)\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[0;31m# if this is the first chunk, and text chunk already\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 182\u001b[0m \u001b[0;31m# is an answer, then return it\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 183\u001b[0;31m response = self._give_response_single(\n\u001b[0m\u001b[1;32m 184\u001b[0m \u001b[0mquery_str\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtext_chunk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mresponse_kwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 185\u001b[0m )\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\u001b[0m in \u001b[0;36m_give_response_single\u001b[0;34m(self, query_str, text_chunk, **response_kwargs)\u001b[0m\n\u001b[1;32m 236\u001b[0m structured_response = cast(\n\u001b[1;32m 237\u001b[0m \u001b[0mStructuredRefineResponse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 238\u001b[0;31m program(\n\u001b[0m\u001b[1;32m 239\u001b[0m \u001b[0mcontext_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcur_text_chunk\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 240\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mresponse_kwargs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in 
\u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwds)\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0manswer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 84\u001b[0;31m answer = self._llm.predict(\n\u001b[0m\u001b[1;32m 85\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_prompt\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/llm.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, prompt, **prompt_args)\u001b[0m\n\u001b[1;32m 557\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetadata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_chat_model\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 558\u001b[0m \u001b[0mmessages\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_messages\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprompt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mprompt_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 559\u001b[0;31m \u001b[0mchat_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessages\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 560\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mchat_response\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontent\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 561\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/callbacks.py\u001b[0m in \u001b[0;36mwrapped_llm_chat\u001b[0;34m(_self, messages, **kwargs)\u001b[0m\n\u001b[1;32m 170\u001b[0m )\n\u001b[1;32m 171\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 172\u001b[0;31m \u001b[0mf_return_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_self\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessages\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 173\u001b[0m 
\u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 174\u001b[0m callback_manager.on_event_end(\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/custom.py\u001b[0m in \u001b[0;36mchat\u001b[0;34m(self, messages, **kwargs)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mchat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessages\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mSequence\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mChatMessage\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mAny\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mChatResponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0mprompt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmessages_to_prompt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessages\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0mcompletion_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcomplete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprompt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformatted\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcompletion_response_to_chat_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcompletion_response\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(func, instance, args, kwargs)\u001b[0m\n\u001b[1;32m 228\u001b[0m )\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpanDropEvent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspan_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_str\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/callbacks.py\u001b[0m in \u001b[0;36mwrapped_llm_predict\u001b[0;34m(_self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 427\u001b[0m 
)\n\u001b[1;32m 428\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 429\u001b[0;31m \u001b[0mf_return_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_self\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 430\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 431\u001b[0m callback_manager.on_event_end(\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36mcomplete\u001b[0;34m(self, prompt, **kwargs)\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mllm_completion_callback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcomplete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprompt\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mAny\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mCompletionResponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 48\u001b[0;31m \u001b[0mvalidated_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmonkey_completion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprompt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 49\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mCompletionResponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalidated_response\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw_llm_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36mmonkey_completion\u001b[0;34m(prompt, **kwargs)\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcontext_component_of_prompt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mprompt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpartition\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Context information is below.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquery_component_of_prompt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mprompt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpartition\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Query: \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m return guard(\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0mllm_api\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mopenai\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompletions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m 
\u001b[0mprompt\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprompt\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/guard.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, llm_api, prompt_params, num_reasks, prompt, instructions, msg_history, metadata, full_schema_reask, *args, **kwargs)\u001b[0m\n\u001b[1;32m 889\u001b[0m )\n\u001b[1;32m 890\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 891\u001b[0;31m return self._execute(\n\u001b[0m\u001b[1;32m 892\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 893\u001b[0m \u001b[0mllm_api\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mllm_api\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/guard.py\u001b[0m in \u001b[0;36m_execute\u001b[0;34m(self, llm_api, llm_output, prompt_params, num_reasks, prompt, instructions, msg_history, reask_prompt, reask_instructions, metadata, full_schema_reask, *args, **kwargs)\u001b[0m\n\u001b[1;32m 769\u001b[0m \u001b[0mwrapped__exec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwrap_with_otel_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcurrent_otel_context\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m__exec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 770\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 771\u001b[0;31m return guard_context.run(\n\u001b[0m\u001b[1;32m 772\u001b[0m \u001b[0mwrapped__exec\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 773\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/utils/telemetry_utils.py\u001b[0m in \u001b[0;36mwrapped_func\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[0;31m# Execute 'func' within the attached context\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 347\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 348\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[0;31m# Ensure the context is detached after execution\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/guard.py\u001b[0m in \u001b[0;36m__exec\u001b[0;34m(self, llm_api, llm_output, prompt_params, num_reasks, prompt, instructions, msg_history, metadata, full_schema_reask, *args, **kwargs)\u001b[0m\n\u001b[1;32m 746\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhistory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpush\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcall_log\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 747\u001b[0m \u001b[0;31m# Otherwise, call the LLM 
synchronously\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 748\u001b[0;31m return self._exec(\n\u001b[0m\u001b[1;32m 749\u001b[0m \u001b[0mllm_api\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mllm_api\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 750\u001b[0m \u001b[0mllm_output\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mllm_output\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/guard.py\u001b[0m in \u001b[0;36m_exec\u001b[0;34m(self, llm_api, llm_output, call_log, prompt_params, num_reasks, metadata, full_schema_reask, prompt, instructions, msg_history, *args, **kwargs)\u001b[0m\n\u001b[1;32m 844\u001b[0m \u001b[0mexec_options\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exec_opts\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 845\u001b[0m )\n\u001b[0;32m--> 846\u001b[0;31m \u001b[0mcall\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrunner\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcall_log\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcall_log\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprompt_params\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprompt_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 847\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mValidationOutcome\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mOT\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_guard_history\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcall\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 848\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/run/runner.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, call_log, prompt_params)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[0;31m# Because Pydantic v1 doesn't respect property setters\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0mcall_log\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexception\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 246\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcall_log\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/run/runner.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, call_log, prompt_params)\u001b[0m\n\u001b[1;32m 195\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_reasks\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 196\u001b[0m \u001b[0;31m# Run a single step.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 197\u001b[0;31m iteration = self.step(\n\u001b[0m\u001b[1;32m 198\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 199\u001b[0m 
\u001b[0mapi\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/utils/telemetry_utils.py\u001b[0m in \u001b[0;36mto_trace_or_not_to_trace\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 263\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 264\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 265\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 266\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mto_trace_or_not_to_trace\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/run/runner.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, index, output_schema, call_log, api, instructions, prompt, msg_history, prompt_params, output)\u001b[0m\n\u001b[1;32m 336\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merror_message\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 337\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexception\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 338\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 339\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/run/runner.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, index, output_schema, call_log, api, instructions, prompt, msg_history, prompt_params, output)\u001b[0m\n\u001b[1;32m 321\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[0;31m# Validate: run output validation.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 323\u001b[0;31m validated_output = self.validate(\n\u001b[0m\u001b[1;32m 324\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparsed_output\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_schema\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 325\u001b[0m )\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/run/runner.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, iteration, attempt_number, parsed_output, output_schema, stream, **kwargs)\u001b[0m\n\u001b[1;32m 597\u001b[0m \u001b[0mstream\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 599\u001b[0;31m validated_output, metadata = 
validator_service.validate(\n\u001b[0m\u001b[1;32m 600\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparsed_output\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 601\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetadata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/validator_service.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(value, metadata, validator_map, iteration, disable_tracer, path, stream, **kwargs)\u001b[0m\n\u001b[1;32m 737\u001b[0m \u001b[0mvalidator_service\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSequentialValidatorService\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdisable_tracer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 738\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 739\u001b[0;31m return validator_service.validate(\n\u001b[0m\u001b[1;32m 740\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidator_map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 741\u001b[0m )\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/validator_service.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, value, metadata, validator_map, iteration, absolute_path, reference_path, stream, **kwargs)\u001b[0m\n\u001b[1;32m 370\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 371\u001b[0m \u001b[0;31m# Then validate the parent value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 372\u001b[0;31m value, metadata = self.run_validators(\n\u001b[0m\u001b[1;32m 373\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 374\u001b[0m \u001b[0mvalidator_map\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/validator_service.py\u001b[0m in \u001b[0;36mrun_validators\u001b[0;34m(self, iteration, validator_map, value, metadata, absolute_property_path, reference_property_path, stream, **kwargs)\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 290\u001b[0m )\n\u001b[0;32m--> 291\u001b[0;31m value = self.perform_correction(\n\u001b[0m\u001b[1;32m 292\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 293\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/guardrails/validator_service.py\u001b[0m in \u001b[0;36mperform_correction\u001b[0;34m(self, results, value, validator, on_fail_descriptor, rechecked_value)\u001b[0m\n\u001b[1;32m 99\u001b[0m )\n\u001b[1;32m 100\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mon_fail_descriptor\u001b[0m \u001b[0;34m==\u001b[0m 
\u001b[0mOnFailAction\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEXCEPTION\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 101\u001b[0;31m raise ValidationError(\n\u001b[0m\u001b[1;32m 102\u001b[0m \u001b[0;34m\"Validation failed for field with errors: \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\", \"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror_message\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValidationError\u001b[0m: Validation failed for field with errors: The LLM says hallucinated. The validation failed." - ] - } - ], + "outputs": [], "source": [ "from guardrails.errors import ValidationError\n", "\n", From a5ebfc8453fe3501bc11a2a79d16b6f32e8921bf Mon Sep 17 00:00:00 2001 From: Julia Gomes Date: Fri, 26 Jul 2024 15:01:00 -0700 Subject: [PATCH 5/8] Some edits to notebook --- ...ize_hallucination_evaluator_with_rag.ipynb | 197 +----------------- 1 file changed, 3 insertions(+), 194 deletions(-) diff --git a/validator/tutorial_arize_hallucination_evaluator_with_rag.ipynb b/validator/tutorial_arize_hallucination_evaluator_with_rag.ipynb index ddeb97a..fd108b5 100644 --- a/validator/tutorial_arize_hallucination_evaluator_with_rag.ipynb +++ b/validator/tutorial_arize_hallucination_evaluator_with_rag.ipynb @@ -83,201 +83,10 @@ { "cell_type": "code", "source": [ - "import os\n", - "from typing import Any, Callable, Dict, Optional, Type\n", - "import logging\n", - "from abc import ABC, abstractmethod\n", - "\n", - "from guardrails.validator_base import (\n", - " FailResult,\n", - " PassResult,\n", - " ValidationResult,\n", - " Validator,\n", - " register_validator,\n", - ")\n", - "from guardrails.stores.context import get_call_kwarg\n", - "from litellm import completion, get_llm_provider\n", - "\n", - "logger = logging.getLogger(__name__)\n", - "\n", - "\n", - "class ArizeRagEvalPromptBase(ABC):\n", - " def __init__(self, prompt_name, **kwargs) -> None:\n", - " self.prompt_name = prompt_name\n", - "\n", - " @abstractmethod\n", - " def generate_prompt(self, user_input_message: str, reference_text: str, llm_response: str) -> str:\n", - " pass\n", - "\n", - "\n", - "class HallucinationPrompt(ArizeRagEvalPromptBase):\n", - " def generate_prompt(self, user_input_message: str, reference_text: str, llm_response: str) -> str:\n", - " return f\"\"\"\n", - " In this task, you will be presented with a query, a reference text and an answer. The answer is\n", - " generated to the question based on the reference text. The answer may contain false information. You\n", - " must use the reference text to determine if the answer to the question contains false information,\n", - " if the answer is a hallucination of facts. Your objective is to determine whether the answer text\n", - " contains factual information and is not a hallucination. A 'hallucination' refers to\n", - " an answer that is not based on the reference text or assumes information that is not available in\n", - " the reference text. Your response should be a single word: either \"factual\" or \"hallucinated\", and\n", - " it should not include any other text or characters. 
\"hallucinated\" indicates that the answer\n", - " provides factually inaccurate information to the query based on the reference text. \"factual\"\n", - " indicates that the answer to the question is correct relative to the reference text, and does not\n", - " contain made up information. Please read the query and reference text carefully before determining\n", - " your response.\n", - "\n", - " [BEGIN DATA]\n", - " ************\n", - " [Query]: {user_input_message}\n", - " ************\n", - " [Reference text]: {reference_text}\n", - " ************\n", - " [Answer]: {llm_response}\n", - " ************\n", - " [END DATA]\n", - "\n", - " Is the answer above factual or hallucinated based on the query and reference text?\n", - " \"\"\"\n", - "\n", - "\n", - "@register_validator(name=\"arize/llm_rag_evaluator\", data_type=\"string\")\n", - "class LlmRagEvaluator(Validator):\n", - " \"\"\"This class validates an output generated by a LiteLLM (LLM) model by prompting another LLM model to evaluate the output.\n", - "\n", - " **Key Properties**\n", - "\n", - " | Property | Description |\n", - " | ----------------------------- | --------------------------------- |\n", - " | Name for `format` attribute | `arize/relevancy_evaluator` |\n", - " | Supported data types | `string` |\n", - " | Programmatic fix | N/A |\n", - "\n", - " Args:\n", - " llm_callable (str, optional): The name of the LiteLLM model to use for validation. Defaults to \"gpt-3.5-turbo\".\n", - " on_fail (Callable, optional): A function to be called when validation fails. Defaults to None.\n", - " \"\"\"\n", - "\n", - " def __init__(\n", - " self,\n", - " eval_llm_prompt_generator: Type[ArizeRagEvalPromptBase],\n", - " llm_evaluator_fail_response: str,\n", - " llm_evaluator_pass_response: str,\n", - " llm_callable: str,\n", - " on_fail: Optional[Callable] = \"noop\",\n", - " **kwargs,\n", - " ):\n", - " super().__init__(\n", - " on_fail,\n", - " eval_llm_prompt_generator=eval_llm_prompt_generator,\n", - " llm_evaluator_fail_response=llm_evaluator_fail_response,\n", - " llm_evaluator_pass_response=llm_evaluator_pass_response,\n", - " llm_callable=llm_callable,\n", - " **kwargs)\n", - " self._llm_evaluator_prompt_generator = eval_llm_prompt_generator\n", - " self._llm_callable = llm_callable\n", - " self._fail_response = llm_evaluator_fail_response\n", - " self._pass_response = llm_evaluator_pass_response\n", - "\n", - " def get_llm_response(self, prompt: str) -> str:\n", - " \"\"\"Gets the response from the LLM.\n", - "\n", - " Args:\n", - " prompt (str): The prompt to send to the LLM.\n", - "\n", - " Returns:\n", - " str: The response from the LLM.\n", - " \"\"\"\n", - " # 0. Create messages\n", - " messages = [{\"content\": prompt, \"role\": \"user\"}]\n", - "\n", - " # 0b. Setup auth kwargs if the model is from OpenAI\n", - " kwargs = {}\n", - " _model, provider, *_rest = get_llm_provider(self._llm_callable)\n", - " if provider == \"openai\":\n", - " kwargs[\"api_key\"] = get_call_kwarg(\"api_key\") or os.environ.get(\"OPENAI_API_KEY\")\n", - "\n", - " # 1. Get LLM response\n", - " # Strip whitespace and convert to lowercase\n", - " try:\n", - " response = completion(model=self._llm_callable, messages=messages, **kwargs)\n", - " response = response.choices[0].message.content # type: ignore\n", - " response = response.strip().lower()\n", - " except Exception as e:\n", - " raise RuntimeError(f\"Error getting response from the LLM: {e}\") from e\n", - "\n", - " # 3. 
Return the response\n", - " return response\n", - "\n", - " def validate(self, value: Any, metadata: Dict) -> ValidationResult:\n", - " \"\"\"\n", - " Validates is based on the relevance of the reference text to the original question.\n", - "\n", - " Args:\n", - " value (Any): The value to validate. It must contain 'original_prompt' and 'reference_text' keys.\n", - " metadata (Dict): The metadata for the validation.\n", - " user_message: Required key. User query passed into RAG LLM.\n", - " context: Required key. Context used by RAG LLM.\n", - " llm_response: Optional key. By default, the gaurded LLM will make the RAG LLM call, which corresponds\n", - " to the `value`. If the user calls the guard with on=\"prompt\", then the original RAG LLM response\n", - " needs to be passed into the guard as metadata for the LLM judge to evaluate.\n", - "\n", - " Returns:\n", - " ValidationResult: The result of the validation. It can be a PassResult if the reference\n", - " text is relevant to the original question, or a FailResult otherwise.\n", - " \"\"\"\n", - " # 1. Get the question and arg from the value\n", - " user_input_message = metadata.get(\"user_message\")\n", - " if user_input_message is None:\n", - " raise RuntimeError(\n", - " \"original_prompt missing from value. \"\n", - " \"Please provide the original prompt.\"\n", - " )\n", - "\n", - " reference_text = metadata.get(\"context\")\n", - " if reference_text is None:\n", - " raise RuntimeError(\n", - " \"'reference_text' missing from value. \"\n", - " \"Please provide the reference text.\"\n", - " )\n", - "\n", - " # Option to override guarded LLM call with response passed in through metadata\n", - " if metadata.get(\"llm_response\") is not None:\n", - " value = metadata.get(\"llm_response\")\n", - "\n", - " # 2. Setup the prompt\n", - " prompt = self._llm_evaluator_prompt_generator.generate_prompt(user_input_message=user_input_message, reference_text=reference_text, llm_response=value)\n", - " print(f\"\\nevaluator prompt: {prompt}\")\n", - "\n", - " # 3. Get the LLM response\n", - " llm_response = self.get_llm_response(prompt)\n", - " print(f\"\\nllm evaluator response: {llm_response}\")\n", - "\n", - " # 4. Check the LLM response and return the result\n", - " if llm_response == self._fail_response:\n", - " print(f\"\\nVALIDATION FAILED\")\n", - " return FailResult(error_message=f\"The LLM says {self._fail_response}. The validation failed.\")\n", - "\n", - " if llm_response == self._pass_response:\n", - " print(f\"\\nVALIDATION PASSED\")\n", - " return PassResult()\n", - "\n", - " print(f\"\\nVALIDATION FAILED\")\n", - " return FailResult(\n", - " error_message=\"The LLM returned an invalid answer. 
Failing the validation...\"\n", - " )\n" - ], - "metadata": { - "id": "WIdK2paCf828" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# !guardrails hub install hub://arize-ai/llm_rag_evaluator\n", + "!pip install --no-cache-dir -qq git+https://github.com/Arize-ai/rag-llm-prompt-evaluator-guard\n", + "!guardrails hub install hub://arize-ai/llm_rag_evaluator\n", "\n", - "# from guardrails.hub import LlmRagEvaluator" + "from guardrails.hub import LlmRagEvaluator" ], "metadata": { "id": "TK22jVYzBTJT" From fe3309fdc7154ba29fcc2ce20613e766caf34515 Mon Sep 17 00:00:00 2001 From: Julia Gomes Date: Fri, 26 Jul 2024 15:19:18 -0700 Subject: [PATCH 6/8] remove flow --- .github/workflows/pr_qc.yml | 19 --- build/lib/validator/__init__.py | 3 + .../benchmark_context_relevancy_prompt.py | 143 +++++++++++++++++ .../benchmark_hallucination_prompt.py | 147 +++++++++++++++++ .../benchmark_qa_correctness_prompt.py | 149 ++++++++++++++++++ validator_template.egg-info/PKG-INFO | 124 +++++++++++++++ validator_template.egg-info/SOURCES.txt | 13 ++ .../dependency_links.txt | 1 + validator_template.egg-info/requires.txt | 6 + validator_template.egg-info/top_level.txt | 1 + 10 files changed, 587 insertions(+), 19 deletions(-) delete mode 100644 .github/workflows/pr_qc.yml create mode 100644 build/lib/validator/__init__.py create mode 100644 build/lib/validator/benchmark_context_relevancy_prompt.py create mode 100644 build/lib/validator/benchmark_hallucination_prompt.py create mode 100644 build/lib/validator/benchmark_qa_correctness_prompt.py create mode 100644 validator_template.egg-info/PKG-INFO create mode 100644 validator_template.egg-info/SOURCES.txt create mode 100644 validator_template.egg-info/dependency_links.txt create mode 100644 validator_template.egg-info/requires.txt create mode 100644 validator_template.egg-info/top_level.txt diff --git a/.github/workflows/pr_qc.yml b/.github/workflows/pr_qc.yml deleted file mode 100644 index 7a46efe..0000000 --- a/.github/workflows/pr_qc.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: Pull Request Quality Checks -on: - pull_request: - types: [ opened, synchronize ] - branches: [ main ] -jobs: - run-qa: - runs-on: ubuntu-latest - steps: - - name: Check out repository code - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: 3.11 - - name: Run qa - run: | - pip install ".[dev]" - make qa diff --git a/build/lib/validator/__init__.py b/build/lib/validator/__init__.py new file mode 100644 index 0000000..72a2623 --- /dev/null +++ b/build/lib/validator/__init__.py @@ -0,0 +1,3 @@ +from .main import ValidatorTemplate + +__all__ = ["ValidatorTemplate"] diff --git a/build/lib/validator/benchmark_context_relevancy_prompt.py b/build/lib/validator/benchmark_context_relevancy_prompt.py new file mode 100644 index 0000000..cce71ff --- /dev/null +++ b/build/lib/validator/benchmark_context_relevancy_prompt.py @@ -0,0 +1,143 @@ +"""Script to evaluate Context Relevancy Guard on "wiki_qa-train" benchmark dataset. 
+* https://huggingface.co/datasets/microsoft/wiki_qa + +Model: gpt-4o-mini +Guard Results + precision recall f1-score support + + relevant 0.70 0.86 0.77 93 + unrelated 0.85 0.68 0.76 107 + + accuracy 0.77 200 + macro avg 0.78 0.77 0.76 200 +weighted avg 0.78 0.77 0.76 200 + +Latency +count 200.000000 +mean 2.812122 +std 1.753805 +min 1.067620 +25% 1.708051 +50% 2.248962 +75% 3.321251 +max 14.102804 +Name: guard_latency_gpt-4o-mini, dtype: float64 +median latency +2.2489616039965767 + +Model: gpt-4-turbo +Guard Results + precision recall f1-score support + + relevant 0.64 0.90 0.75 93 + unrelated 0.87 0.56 0.68 107 + + accuracy 0.72 200 + macro avg 0.76 0.73 0.72 200 +weighted avg 0.76 0.72 0.71 200 + +Latency +count 200.000000 +mean 8.561413 +std 6.425799 +min 1.624563 +25% 3.957226 +50% 5.979291 +75% 11.579224 +max 34.342637 +Name: guard_latency_gpt-4-turbo, dtype: float64 +median latency +5.979290812509134 +""" +import os +import time +from getpass import getpass +from typing import List, Tuple + +import openai +import pandas as pd +from sklearn.metrics import classification_report + +from guardrails import Guard +from main import ContextRelevancyPrompt, LlmRagEvaluator +from phoenix.evals import download_benchmark_dataset +from sklearn.utils import shuffle + + +RANDOM_STATE = 119 +MODELS = ["gpt-4o-mini", "gpt-4-turbo"] +N_EVAL_SAMPLE_SIZE = 200 +SAVE_RESULTS_PATH = "context_relevancy_guard_results.csv" + + +def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard, model: str) -> Tuple[List[float], List[bool]]: + """Evaluate guard on benchmark dataset. + + :param test_dataset: Dataframe of test examples. + :param guard: Guard we want to evaluate. + + :return: Tuple where the first lists contains latency, and the second list contains a boolean indicating whether the guard passed. 
+ """ + latency_measurements = [] + guard_passed = [] + for _, rag_example in test_dataset.iterrows(): + start_time = time.perf_counter() + response = guard( + llm_api=openai.chat.completions.create, + prompt=rag_example["query_text"], + model=model, + max_tokens=1024, + temperature=0.5, + metadata={ + "user_message": rag_example["query_text"], + "context": rag_example["document_text"], + } + ) + latency_measurements.append(time.perf_counter() - start_time) + guard_passed.append(response.validation_passed) + return latency_measurements, guard_passed + + +if __name__ == "__main__": + if not (openai_api_key := os.getenv("OPENAI_API_KEY")): + openai_api_key = getpass("🔑 Enter your OpenAI API key: ") + openai.api_key = openai_api_key + os.environ["OPENAI_API_KEY"] = openai_api_key + + # Columns: Index(['query_id', 'query_text', 'document_title', 'document_text', 'document_text_with_emphasis', 'relevant'] + test_dataset = download_benchmark_dataset( + task="binary-relevance-classification", + dataset_name="wiki_qa-train") + test_dataset = shuffle(test_dataset, random_state=RANDOM_STATE) + test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE] + + for model in MODELS: + guard = Guard.from_string( + validators=[ + LlmRagEvaluator( + eval_llm_prompt_generator=ContextRelevancyPrompt(prompt_name="context_relevancy_judge_llm"), + llm_evaluator_fail_response="unrelated", + llm_evaluator_pass_response="relevant", + llm_callable=model, + on_fail="noop", + on="prompt") + ], + ) + + latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard, model=model) + test_dataset[f"guard_passed_{model}"] = guard_passed + test_dataset[f"guard_latency_{model}"] = latency_measurements + + print(f"\nModel: {model}") + print("Guard Results") + # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags an irrelevant answer) + print(classification_report( + test_dataset["relevant"].replace(True, "relevant").replace(False, "unrelated"), + test_dataset[f"guard_passed_{model}"].replace(True, "relevant").replace(False, "unrelated"))) + print("Latency") + print(test_dataset[f"guard_latency_{model}"].describe()) + print("median latency") + print(test_dataset[f"guard_latency_{model}"].median()) + + if SAVE_RESULTS_PATH: + test_dataset.to_csv(SAVE_RESULTS_PATH) diff --git a/build/lib/validator/benchmark_hallucination_prompt.py b/build/lib/validator/benchmark_hallucination_prompt.py new file mode 100644 index 0000000..bf68f7b --- /dev/null +++ b/build/lib/validator/benchmark_hallucination_prompt.py @@ -0,0 +1,147 @@ +"""Script to evaluate Hallucination Guard on benchmark dataset. 
+Currently supported datasets include "halueval_qa_data" from the HaluEval benchmark: +* https://arxiv.org/abs/2305.11747 +* https://github.com/RUCAIBox/HaluEval + +Model: gpt-4o-mini +Guard Results + precision recall f1-score support + + factual 0.79 0.97 0.87 129 +hallucinated 0.96 0.73 0.83 121 + + accuracy 0.85 250 + macro avg 0.87 0.85 0.85 250 +weighted avg 0.87 0.85 0.85 250 + +Latency +count 250.000000 +mean 1.865513 +std 0.603700 +min 1.139974 +25% 1.531160 +50% 1.758210 +75% 2.026153 +max 6.403010 +Name: guard_latency_gpt-4o-mini, dtype: float64 +median latency +1.7582097915001214 + +Model: gpt-4-turbo +Guard Results + precision recall f1-score support + + factual 0.83 0.88 0.85 129 +hallucinated 0.87 0.80 0.83 121 + + accuracy 0.84 250 + macro avg 0.85 0.84 0.84 250 +weighted avg 0.85 0.84 0.84 250 + +Latency +count 250.000000 +mean 4.295613 +std 2.393394 +min 1.460899 +25% 2.868255 +50% 3.724649 +75% 4.939440 +max 23.465773 +Name: guard_latency_gpt-4-turbo, dtype: float64 +median latency +3.724648874514969 +""" +import os +import time +from getpass import getpass +from typing import List, Tuple + +import openai +import pandas as pd +from sklearn.metrics import classification_report +from sklearn.utils import shuffle + +from guardrails import Guard +from main import HallucinationPrompt, LlmRagEvaluator +from phoenix.evals import download_benchmark_dataset + + +RANDOM_STATE = 119 +MODELS = ["gpt-4o-mini", "gpt-4-turbo"] +N_EVAL_SAMPLE_SIZE = 250 +SAVE_RESULTS_PATH = "hallucination_guard_results.csv" + + +def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard, model: str) -> Tuple[List[float], List[bool]]: + """Evaluate guard on benchmark dataset. + + :param test_dataset: Dataframe of test examples. + :param guard: Guard we want to evaluate. + + :return: Tuple where the first lists contains latency, and the second list contains a boolean indicating whether the guard passed. 
+ """ + latency_measurements = [] + guard_passed = [] + for _, rag_example in test_dataset.iterrows(): + start_time = time.perf_counter() + response = guard( + llm_api=openai.chat.completions.create, + prompt=rag_example["query"], + model=model, + max_tokens=1024, + temperature=0.5, + metadata={ + "user_message": rag_example["query"], + "context": rag_example["reference"], + "llm_response": rag_example["response"], + } + ) + latency_measurements.append(time.perf_counter() - start_time) + guard_passed.append(response.validation_passed) + return latency_measurements, guard_passed + + +if __name__ == "__main__": + if not (openai_api_key := os.getenv("OPENAI_API_KEY")): + openai_api_key = getpass("🔑 Enter your OpenAI API key: ") + openai.api_key = openai_api_key + os.environ["OPENAI_API_KEY"] = openai_api_key + + # Columns: ['reference', 'query', 'response', 'is_hallucination'] + test_dataset = download_benchmark_dataset( + task="binary-hallucination-classification", + dataset_name="halueval_qa_data") + test_dataset = shuffle(test_dataset, random_state=119) + test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE] + + for model in MODELS: + guard = Guard.from_string( + validators=[ + LlmRagEvaluator( + eval_llm_prompt_generator=HallucinationPrompt(prompt_name="hallucination_judge_llm"), + llm_evaluator_fail_response="hallucinated", + llm_evaluator_pass_response="factual", + llm_callable=model, + on_fail="noop", + on="prompt") + ], + ) + + latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard, model=model) + test_dataset[f"guard_passed_{model}"] = guard_passed + test_dataset[f"guard_latency_{model}"] = latency_measurements + + print(f"\nModel: {model}") + print("Guard Results") + # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags a hallucination) + print(classification_report( + test_dataset["is_hallucination"].replace(True, "hallucinated").replace(False, "factual"), + test_dataset[f"guard_passed_{model}"].replace(True, "factual").replace(False, "hallucinated"))) + + print("Latency") + print(test_dataset[f"guard_latency_{model}"].describe()) + print("median latency") + print(test_dataset[f"guard_latency_{model}"].median()) + + if SAVE_RESULTS_PATH: + test_dataset.to_csv(SAVE_RESULTS_PATH) diff --git a/build/lib/validator/benchmark_qa_correctness_prompt.py b/build/lib/validator/benchmark_qa_correctness_prompt.py new file mode 100644 index 0000000..f821527 --- /dev/null +++ b/build/lib/validator/benchmark_qa_correctness_prompt.py @@ -0,0 +1,149 @@ +"""Script to evaluate QA Correctness Guard on benchmark dataset. +The 2.0 version of the large-scale dataset Stanford Question Answering Dataset (SQuAD 2.0) allows +researchers to design AI models for reading comprehension tasks under challenging constraints. 
+https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1194/reports/default/15785042.pdf + +Model: gpt-4o-mini + +Guard Results + precision recall f1-score support + + correct 1.00 0.96 0.98 133 + incorrect 0.96 1.00 0.98 117 + + accuracy 0.98 250 + macro avg 0.98 0.98 0.98 250 +weighted avg 0.98 0.98 0.98 250 + +Latency +count 250.000000 +mean 2.610912 +std 1.415877 +min 1.148114 +25% 1.678278 +50% 2.263149 +75% 2.916726 +max 10.625763 +Name: guard_latency_gpt-4o-mini, dtype: float64 +median latency +2.263148645986803 + +Model: gpt-4-turbo + +Guard Results + precision recall f1-score support + + correct 1.00 0.92 0.96 133 + incorrect 0.91 1.00 0.96 117 + + accuracy 0.96 250 + macro avg 0.96 0.96 0.96 250 +weighted avg 0.96 0.96 0.96 250 + +Latency +count 250.000000 +mean 7.390556 +std 5.804535 +min 1.671949 +25% 3.544383 +50% 5.239343 +75% 8.484112 +max 30.651372 +Name: guard_latency_gpt-4-turbo, dtype: float64 +median latency +5.239343083492713 +""" +import os +import time +from getpass import getpass +from typing import List, Tuple + +import openai +import pandas as pd +from sklearn.metrics import classification_report + +from guardrails import Guard +from main import QACorrectnessPrompt, LlmRagEvaluator +from phoenix.evals import download_benchmark_dataset +from sklearn.utils import shuffle + + +RANDOM_STATE = 119 +MODELS = ["gpt-4o-mini", "gpt-4-turbo"] +N_EVAL_SAMPLE_SIZE = 250 +SAVE_RESULTS_PATH = "qa_correctness_guard_results.csv" + + +def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard, model: str) -> Tuple[List[float], List[bool]]: + """Evaluate guard on benchmark dataset. + + :param test_dataset: Dataframe of test examples. + :param guard: Guard we want to evaluate. + + :return: Tuple where the first lists contains latency, and the second list contains a boolean indicating whether the guard passed. 
+ """ + latency_measurements = [] + guard_passed = [] + for _, rag_example in test_dataset.iterrows(): + start_time = time.perf_counter() + response = guard( + llm_api=openai.chat.completions.create, + prompt=rag_example["question"], + model=model, + max_tokens=1024, + temperature=0.5, + metadata={ + "user_message": rag_example["question"], + "context": rag_example["context"], + "llm_response": rag_example["sampled_answer"], + } + ) + latency_measurements.append(time.perf_counter() - start_time) + guard_passed.append(response.validation_passed) + return latency_measurements, guard_passed + + +if __name__ == "__main__": + if not (openai_api_key := os.getenv("OPENAI_API_KEY")): + openai_api_key = getpass("🔑 Enter your OpenAI API key: ") + openai.api_key = openai_api_key + os.environ["OPENAI_API_KEY"] = openai_api_key + + # Columns: Index(['id', 'title', 'context', 'question', 'answers', 'correct_answer', 'wrong_answer', 'sampled_answer', 'answer_true'] + test_dataset = df = download_benchmark_dataset( + task="qa-classification", + dataset_name="qa_generated_dataset") + test_dataset = shuffle(test_dataset, random_state=RANDOM_STATE) + test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE] + + for model in MODELS: + guard = Guard.from_string( + validators=[ + LlmRagEvaluator( + eval_llm_prompt_generator=QACorrectnessPrompt(prompt_name="qa_correctness_judge_llm"), + llm_evaluator_fail_response="incorrect", + llm_evaluator_pass_response="correct", + llm_callable=model, + on_fail="noop", + on="prompt") + ], + ) + + latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard, model=model) + test_dataset[f"guard_passed_{model}"] = guard_passed + test_dataset[f"guard_latency_{model}"] = latency_measurements + + print(f"\nModel: {model}") + print("\nGuard Results") + # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags an incorrect answer) + print(classification_report( + test_dataset["answer_true"].replace(True, "correct").replace(False, "incorrect"), + test_dataset[f"guard_passed_{model}"].replace(True, "correct").replace(False, "incorrect"))) + + print("Latency") + print(test_dataset[f"guard_latency_{model}"].describe()) + print("median latency") + print(test_dataset[f"guard_latency_{model}"].median()) + + if SAVE_RESULTS_PATH: + test_dataset.to_csv(SAVE_RESULTS_PATH) diff --git a/validator_template.egg-info/PKG-INFO b/validator_template.egg-info/PKG-INFO new file mode 100644 index 0000000..f49d811 --- /dev/null +++ b/validator_template.egg-info/PKG-INFO @@ -0,0 +1,124 @@ +Metadata-Version: 2.1 +Name: validator-template +Version: 0.0.0 +Summary: Template repo for Guardrails Hub validators. +Author-email: Guardrails AI +Requires-Python: >=3.8.1 +Description-Content-Type: text/markdown +License-File: LICENSE.txt +Requires-Dist: guardrails-ai>=0.4.0 +Provides-Extra: dev +Requires-Dist: pyright; extra == "dev" +Requires-Dist: pytest; extra == "dev" +Requires-Dist: ruff; extra == "dev" + +# Overview + +| Developed by | Guardrails AI | +| --- | --- | +| Date of development | Feb 15, 2024 | +| Validator type | Format | +| Blog | | +| License | Apache 2 | +| Input/Output | Output | + +## Description + +### Intended Use +This validator is a template for creating other validators, but for demonstrative purposes it ensures that a generated output is the literal `pass`. 
+ +### Requirements + +* Dependencies: + - guardrails-ai>=0.4.0 + +* Foundation model access keys: + - OPENAI_API_KEY + +## Installation + +```bash +$ guardrails hub install hub://guardrails/validator_template +``` + +## Usage Examples + +### Validating string output via Python + +In this example, we apply the validator to a string output generated by an LLM. + +```python +# Import Guard and Validator +from guardrails.hub import ValidatorTemplate +from guardrails import Guard + +# Setup Guard +guard = Guard().use( + ValidatorTemplate +) + +guard.validate("pass") # Validator passes +guard.validate("fail") # Validator fails +``` + +### Validating JSON output via Python + +In this example, we apply the validator to a string field of a JSON output generated by an LLM. + +```python +# Import Guard and Validator +from pydantic import BaseModel, Field +from guardrails.hub import ValidatorTemplate +from guardrails import Guard + +# Initialize Validator +val = ValidatorTemplate() + +# Create Pydantic BaseModel +class Process(BaseModel): + process_name: str + status: str = Field(validators=[val]) + +# Create a Guard to check for valid Pydantic output +guard = Guard.from_pydantic(output_class=Process) + +# Run LLM output generating JSON through guard +guard.parse(""" +{ + "process_name": "templating", + "status": "pass" +} +""") +``` + +# API Reference + +**`__init__(self, on_fail="noop")`** +
    +Initializes a new instance of the ValidatorTemplate class. + +**Parameters** +- **`arg_1`** *(str)*: A placeholder argument to demonstrate how to use init arguments. +- **`arg_2`** *(str)*: Another placeholder argument to demonstrate how to use init arguments. +- **`on_fail`** *(str, Callable)*: The policy to enact when a validator fails. If `str`, must be one of `reask`, `fix`, `filter`, `refrain`, `noop`, `exception` or `fix_reask`. Otherwise, must be a function that is called when the validator fails. +
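As a quick illustration of the `on_fail` policies listed above, here is a minimal sketch. It assumes the placeholder `arg_1`/`arg_2` arguments can be omitted, and that `on_fail` is passed through `Guard().use(...)` as in the usage examples earlier; treat it as a sketch rather than the template's canonical API.

```python
from guardrails import Guard
from guardrails.hub import ValidatorTemplate

# "exception" surfaces a failure immediately; "noop" records it on the
# validation outcome and leaves the value unchanged.
strict_guard = Guard().use(ValidatorTemplate, on_fail="exception")

try:
    strict_guard.validate("fail")  # anything other than the literal "pass" fails
except Exception as err:
    print(f"Validation failed: {err}")
```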
+
+ +**`validate(self, value, metadata) -> ValidationResult`** +
    +Validates the given `value` using the rules defined in this validator, relying on the `metadata` provided to customize the validation process. This method is automatically invoked by `guard.parse(...)`, ensuring the validation logic is applied to the input data. + +Note: + +1. This method should not be called directly by the user. Instead, invoke `guard.parse(...)` where this method will be called internally for each associated Validator. +2. When invoking `guard.parse(...)`, ensure to pass the appropriate `metadata` dictionary that includes keys and values required by this validator. If `guard` is associated with multiple validators, combine all necessary metadata into a single dictionary. + +**Parameters** +- **`value`** *(Any)*: The input value to validate. +- **`metadata`** *(dict)*: A dictionary containing metadata required for validation. Keys and values must match the expectations of this validator. + + + | Key | Type | Description | Default | + | --- | --- | --- | --- | + | `key1` | String | Description of key1's role. | N/A | +
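To make the metadata note concrete, here is a hedged sketch of forwarding metadata through `guard.parse(...)`, mirroring the Pydantic usage example above. The `key1` key is the placeholder from the table, not a key any shipped validator actually requires.

```python
from pydantic import BaseModel, Field
from guardrails import Guard
from guardrails.hub import ValidatorTemplate

# Initialize Validator
val = ValidatorTemplate()

# Create Pydantic BaseModel with the validator attached to one field
class Process(BaseModel):
    process_name: str
    status: str = Field(validators=[val])

guard = Guard.from_pydantic(output_class=Process)

# The metadata dict is handed to every attached validator's
# validate(value, metadata) call, so combine the keys they all expect.
outcome = guard.parse(
    '{"process_name": "templating", "status": "pass"}',
    metadata={"key1": "value expected by this validator"},
)
print(outcome.validation_passed)
```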
diff --git a/validator_template.egg-info/SOURCES.txt b/validator_template.egg-info/SOURCES.txt new file mode 100644 index 0000000..0406bc2 --- /dev/null +++ b/validator_template.egg-info/SOURCES.txt @@ -0,0 +1,13 @@ +LICENSE.txt +README.md +pyproject.toml +tests/test_validator.py +validator/__init__.py +validator/benchmark_context_relevancy_prompt.py +validator/benchmark_hallucination_prompt.py +validator/benchmark_qa_correctness_prompt.py +validator_template.egg-info/PKG-INFO +validator_template.egg-info/SOURCES.txt +validator_template.egg-info/dependency_links.txt +validator_template.egg-info/requires.txt +validator_template.egg-info/top_level.txt \ No newline at end of file diff --git a/validator_template.egg-info/dependency_links.txt b/validator_template.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/validator_template.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/validator_template.egg-info/requires.txt b/validator_template.egg-info/requires.txt new file mode 100644 index 0000000..f6c47e1 --- /dev/null +++ b/validator_template.egg-info/requires.txt @@ -0,0 +1,6 @@ +guardrails-ai>=0.4.0 + +[dev] +pyright +pytest +ruff diff --git a/validator_template.egg-info/top_level.txt b/validator_template.egg-info/top_level.txt new file mode 100644 index 0000000..07d07ff --- /dev/null +++ b/validator_template.egg-info/top_level.txt @@ -0,0 +1 @@ +validator From 8d0e30cad5d1dc0180865335785e95ad02cbfced Mon Sep 17 00:00:00 2001 From: Julia Gomes Date: Fri, 26 Jul 2024 15:20:21 -0700 Subject: [PATCH 7/8] Remove egg info --- validator_template.egg-info/PKG-INFO | 124 ------------------ validator_template.egg-info/SOURCES.txt | 13 -- .../dependency_links.txt | 1 - validator_template.egg-info/requires.txt | 6 - validator_template.egg-info/top_level.txt | 1 - 5 files changed, 145 deletions(-) delete mode 100644 validator_template.egg-info/PKG-INFO delete mode 100644 validator_template.egg-info/SOURCES.txt delete mode 100644 validator_template.egg-info/dependency_links.txt delete mode 100644 validator_template.egg-info/requires.txt delete mode 100644 validator_template.egg-info/top_level.txt diff --git a/validator_template.egg-info/PKG-INFO b/validator_template.egg-info/PKG-INFO deleted file mode 100644 index f49d811..0000000 --- a/validator_template.egg-info/PKG-INFO +++ /dev/null @@ -1,124 +0,0 @@ -Metadata-Version: 2.1 -Name: validator-template -Version: 0.0.0 -Summary: Template repo for Guardrails Hub validators. -Author-email: Guardrails AI -Requires-Python: >=3.8.1 -Description-Content-Type: text/markdown -License-File: LICENSE.txt -Requires-Dist: guardrails-ai>=0.4.0 -Provides-Extra: dev -Requires-Dist: pyright; extra == "dev" -Requires-Dist: pytest; extra == "dev" -Requires-Dist: ruff; extra == "dev" - -# Overview - -| Developed by | Guardrails AI | -| --- | --- | -| Date of development | Feb 15, 2024 | -| Validator type | Format | -| Blog | | -| License | Apache 2 | -| Input/Output | Output | - -## Description - -### Intended Use -This validator is a template for creating other validators, but for demonstrative purposes it ensures that a generated output is the literal `pass`. - -### Requirements - -* Dependencies: - - guardrails-ai>=0.4.0 - -* Foundation model access keys: - - OPENAI_API_KEY - -## Installation - -```bash -$ guardrails hub install hub://guardrails/validator_template -``` - -## Usage Examples - -### Validating string output via Python - -In this example, we apply the validator to a string output generated by an LLM. 
- -```python -# Import Guard and Validator -from guardrails.hub import ValidatorTemplate -from guardrails import Guard - -# Setup Guard -guard = Guard().use( - ValidatorTemplate -) - -guard.validate("pass") # Validator passes -guard.validate("fail") # Validator fails -``` - -### Validating JSON output via Python - -In this example, we apply the validator to a string field of a JSON output generated by an LLM. - -```python -# Import Guard and Validator -from pydantic import BaseModel, Field -from guardrails.hub import ValidatorTemplate -from guardrails import Guard - -# Initialize Validator -val = ValidatorTemplate() - -# Create Pydantic BaseModel -class Process(BaseModel): - process_name: str - status: str = Field(validators=[val]) - -# Create a Guard to check for valid Pydantic output -guard = Guard.from_pydantic(output_class=Process) - -# Run LLM output generating JSON through guard -guard.parse(""" -{ - "process_name": "templating", - "status": "pass" -} -""") -``` - -# API Reference - -**`__init__(self, on_fail="noop")`** -
    -Initializes a new instance of the ValidatorTemplate class. - -**Parameters** -- **`arg_1`** *(str)*: A placeholder argument to demonstrate how to use init arguments. -- **`arg_2`** *(str)*: Another placeholder argument to demonstrate how to use init arguments. -- **`on_fail`** *(str, Callable)*: The policy to enact when a validator fails. If `str`, must be one of `reask`, `fix`, `filter`, `refrain`, `noop`, `exception` or `fix_reask`. Otherwise, must be a function that is called when the validator fails. -
-
- -**`validate(self, value, metadata) -> ValidationResult`** -
    -Validates the given `value` using the rules defined in this validator, relying on the `metadata` provided to customize the validation process. This method is automatically invoked by `guard.parse(...)`, ensuring the validation logic is applied to the input data. - -Note: - -1. This method should not be called directly by the user. Instead, invoke `guard.parse(...)` where this method will be called internally for each associated Validator. -2. When invoking `guard.parse(...)`, ensure to pass the appropriate `metadata` dictionary that includes keys and values required by this validator. If `guard` is associated with multiple validators, combine all necessary metadata into a single dictionary. - -**Parameters** -- **`value`** *(Any)*: The input value to validate. -- **`metadata`** *(dict)*: A dictionary containing metadata required for validation. Keys and values must match the expectations of this validator. - - - | Key | Type | Description | Default | - | --- | --- | --- | --- | - | `key1` | String | Description of key1's role. | N/A | -
diff --git a/validator_template.egg-info/SOURCES.txt b/validator_template.egg-info/SOURCES.txt deleted file mode 100644 index 0406bc2..0000000 --- a/validator_template.egg-info/SOURCES.txt +++ /dev/null @@ -1,13 +0,0 @@ -LICENSE.txt -README.md -pyproject.toml -tests/test_validator.py -validator/__init__.py -validator/benchmark_context_relevancy_prompt.py -validator/benchmark_hallucination_prompt.py -validator/benchmark_qa_correctness_prompt.py -validator_template.egg-info/PKG-INFO -validator_template.egg-info/SOURCES.txt -validator_template.egg-info/dependency_links.txt -validator_template.egg-info/requires.txt -validator_template.egg-info/top_level.txt \ No newline at end of file diff --git a/validator_template.egg-info/dependency_links.txt b/validator_template.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789..0000000 --- a/validator_template.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/validator_template.egg-info/requires.txt b/validator_template.egg-info/requires.txt deleted file mode 100644 index f6c47e1..0000000 --- a/validator_template.egg-info/requires.txt +++ /dev/null @@ -1,6 +0,0 @@ -guardrails-ai>=0.4.0 - -[dev] -pyright -pytest -ruff diff --git a/validator_template.egg-info/top_level.txt b/validator_template.egg-info/top_level.txt deleted file mode 100644 index 07d07ff..0000000 --- a/validator_template.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -validator From ae9641e6f2af34795275cc31dd646676b5137bcd Mon Sep 17 00:00:00 2001 From: Julia Gomes Date: Fri, 26 Jul 2024 15:21:26 -0700 Subject: [PATCH 8/8] Remove build directory --- build/lib/validator/__init__.py | 3 - .../benchmark_context_relevancy_prompt.py | 143 ----------------- .../benchmark_hallucination_prompt.py | 147 ----------------- .../benchmark_qa_correctness_prompt.py | 149 ------------------ 4 files changed, 442 deletions(-) delete mode 100644 build/lib/validator/__init__.py delete mode 100644 build/lib/validator/benchmark_context_relevancy_prompt.py delete mode 100644 build/lib/validator/benchmark_hallucination_prompt.py delete mode 100644 build/lib/validator/benchmark_qa_correctness_prompt.py diff --git a/build/lib/validator/__init__.py b/build/lib/validator/__init__.py deleted file mode 100644 index 72a2623..0000000 --- a/build/lib/validator/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .main import ValidatorTemplate - -__all__ = ["ValidatorTemplate"] diff --git a/build/lib/validator/benchmark_context_relevancy_prompt.py b/build/lib/validator/benchmark_context_relevancy_prompt.py deleted file mode 100644 index cce71ff..0000000 --- a/build/lib/validator/benchmark_context_relevancy_prompt.py +++ /dev/null @@ -1,143 +0,0 @@ -"""Script to evaluate Context Relevancy Guard on "wiki_qa-train" benchmark dataset. 
-* https://huggingface.co/datasets/microsoft/wiki_qa - -Model: gpt-4o-mini -Guard Results - precision recall f1-score support - - relevant 0.70 0.86 0.77 93 - unrelated 0.85 0.68 0.76 107 - - accuracy 0.77 200 - macro avg 0.78 0.77 0.76 200 -weighted avg 0.78 0.77 0.76 200 - -Latency -count 200.000000 -mean 2.812122 -std 1.753805 -min 1.067620 -25% 1.708051 -50% 2.248962 -75% 3.321251 -max 14.102804 -Name: guard_latency_gpt-4o-mini, dtype: float64 -median latency -2.2489616039965767 - -Model: gpt-4-turbo -Guard Results - precision recall f1-score support - - relevant 0.64 0.90 0.75 93 - unrelated 0.87 0.56 0.68 107 - - accuracy 0.72 200 - macro avg 0.76 0.73 0.72 200 -weighted avg 0.76 0.72 0.71 200 - -Latency -count 200.000000 -mean 8.561413 -std 6.425799 -min 1.624563 -25% 3.957226 -50% 5.979291 -75% 11.579224 -max 34.342637 -Name: guard_latency_gpt-4-turbo, dtype: float64 -median latency -5.979290812509134 -""" -import os -import time -from getpass import getpass -from typing import List, Tuple - -import openai -import pandas as pd -from sklearn.metrics import classification_report - -from guardrails import Guard -from main import ContextRelevancyPrompt, LlmRagEvaluator -from phoenix.evals import download_benchmark_dataset -from sklearn.utils import shuffle - - -RANDOM_STATE = 119 -MODELS = ["gpt-4o-mini", "gpt-4-turbo"] -N_EVAL_SAMPLE_SIZE = 200 -SAVE_RESULTS_PATH = "context_relevancy_guard_results.csv" - - -def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard, model: str) -> Tuple[List[float], List[bool]]: - """Evaluate guard on benchmark dataset. - - :param test_dataset: Dataframe of test examples. - :param guard: Guard we want to evaluate. - - :return: Tuple where the first lists contains latency, and the second list contains a boolean indicating whether the guard passed. 
- """ - latency_measurements = [] - guard_passed = [] - for _, rag_example in test_dataset.iterrows(): - start_time = time.perf_counter() - response = guard( - llm_api=openai.chat.completions.create, - prompt=rag_example["query_text"], - model=model, - max_tokens=1024, - temperature=0.5, - metadata={ - "user_message": rag_example["query_text"], - "context": rag_example["document_text"], - } - ) - latency_measurements.append(time.perf_counter() - start_time) - guard_passed.append(response.validation_passed) - return latency_measurements, guard_passed - - -if __name__ == "__main__": - if not (openai_api_key := os.getenv("OPENAI_API_KEY")): - openai_api_key = getpass("🔑 Enter your OpenAI API key: ") - openai.api_key = openai_api_key - os.environ["OPENAI_API_KEY"] = openai_api_key - - # Columns: Index(['query_id', 'query_text', 'document_title', 'document_text', 'document_text_with_emphasis', 'relevant'] - test_dataset = download_benchmark_dataset( - task="binary-relevance-classification", - dataset_name="wiki_qa-train") - test_dataset = shuffle(test_dataset, random_state=RANDOM_STATE) - test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE] - - for model in MODELS: - guard = Guard.from_string( - validators=[ - LlmRagEvaluator( - eval_llm_prompt_generator=ContextRelevancyPrompt(prompt_name="context_relevancy_judge_llm"), - llm_evaluator_fail_response="unrelated", - llm_evaluator_pass_response="relevant", - llm_callable=model, - on_fail="noop", - on="prompt") - ], - ) - - latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard, model=model) - test_dataset[f"guard_passed_{model}"] = guard_passed - test_dataset[f"guard_latency_{model}"] = latency_measurements - - print(f"\nModel: {model}") - print("Guard Results") - # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags an irrelevant answer) - print(classification_report( - test_dataset["relevant"].replace(True, "relevant").replace(False, "unrelated"), - test_dataset[f"guard_passed_{model}"].replace(True, "relevant").replace(False, "unrelated"))) - print("Latency") - print(test_dataset[f"guard_latency_{model}"].describe()) - print("median latency") - print(test_dataset[f"guard_latency_{model}"].median()) - - if SAVE_RESULTS_PATH: - test_dataset.to_csv(SAVE_RESULTS_PATH) diff --git a/build/lib/validator/benchmark_hallucination_prompt.py b/build/lib/validator/benchmark_hallucination_prompt.py deleted file mode 100644 index bf68f7b..0000000 --- a/build/lib/validator/benchmark_hallucination_prompt.py +++ /dev/null @@ -1,147 +0,0 @@ -"""Script to evaluate Hallucination Guard on benchmark dataset. 
-Currently supported datasets include "halueval_qa_data" from the HaluEval benchmark: -* https://arxiv.org/abs/2305.11747 -* https://github.com/RUCAIBox/HaluEval - -Model: gpt-4o-mini -Guard Results - precision recall f1-score support - - factual 0.79 0.97 0.87 129 -hallucinated 0.96 0.73 0.83 121 - - accuracy 0.85 250 - macro avg 0.87 0.85 0.85 250 -weighted avg 0.87 0.85 0.85 250 - -Latency -count 250.000000 -mean 1.865513 -std 0.603700 -min 1.139974 -25% 1.531160 -50% 1.758210 -75% 2.026153 -max 6.403010 -Name: guard_latency_gpt-4o-mini, dtype: float64 -median latency -1.7582097915001214 - -Model: gpt-4-turbo -Guard Results - precision recall f1-score support - - factual 0.83 0.88 0.85 129 -hallucinated 0.87 0.80 0.83 121 - - accuracy 0.84 250 - macro avg 0.85 0.84 0.84 250 -weighted avg 0.85 0.84 0.84 250 - -Latency -count 250.000000 -mean 4.295613 -std 2.393394 -min 1.460899 -25% 2.868255 -50% 3.724649 -75% 4.939440 -max 23.465773 -Name: guard_latency_gpt-4-turbo, dtype: float64 -median latency -3.724648874514969 -""" -import os -import time -from getpass import getpass -from typing import List, Tuple - -import openai -import pandas as pd -from sklearn.metrics import classification_report -from sklearn.utils import shuffle - -from guardrails import Guard -from main import HallucinationPrompt, LlmRagEvaluator -from phoenix.evals import download_benchmark_dataset - - -RANDOM_STATE = 119 -MODELS = ["gpt-4o-mini", "gpt-4-turbo"] -N_EVAL_SAMPLE_SIZE = 250 -SAVE_RESULTS_PATH = "hallucination_guard_results.csv" - - -def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard, model: str) -> Tuple[List[float], List[bool]]: - """Evaluate guard on benchmark dataset. - - :param test_dataset: Dataframe of test examples. - :param guard: Guard we want to evaluate. - - :return: Tuple where the first lists contains latency, and the second list contains a boolean indicating whether the guard passed. 
- """ - latency_measurements = [] - guard_passed = [] - for _, rag_example in test_dataset.iterrows(): - start_time = time.perf_counter() - response = guard( - llm_api=openai.chat.completions.create, - prompt=rag_example["query"], - model=model, - max_tokens=1024, - temperature=0.5, - metadata={ - "user_message": rag_example["query"], - "context": rag_example["reference"], - "llm_response": rag_example["response"], - } - ) - latency_measurements.append(time.perf_counter() - start_time) - guard_passed.append(response.validation_passed) - return latency_measurements, guard_passed - - -if __name__ == "__main__": - if not (openai_api_key := os.getenv("OPENAI_API_KEY")): - openai_api_key = getpass("🔑 Enter your OpenAI API key: ") - openai.api_key = openai_api_key - os.environ["OPENAI_API_KEY"] = openai_api_key - - # Columns: ['reference', 'query', 'response', 'is_hallucination'] - test_dataset = download_benchmark_dataset( - task="binary-hallucination-classification", - dataset_name="halueval_qa_data") - test_dataset = shuffle(test_dataset, random_state=119) - test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE] - - for model in MODELS: - guard = Guard.from_string( - validators=[ - LlmRagEvaluator( - eval_llm_prompt_generator=HallucinationPrompt(prompt_name="hallucination_judge_llm"), - llm_evaluator_fail_response="hallucinated", - llm_evaluator_pass_response="factual", - llm_callable=model, - on_fail="noop", - on="prompt") - ], - ) - - latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard, model=model) - test_dataset[f"guard_passed_{model}"] = guard_passed - test_dataset[f"guard_latency_{model}"] = latency_measurements - - print(f"\nModel: {model}") - print("Guard Results") - # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags a hallucination) - print(classification_report( - test_dataset["is_hallucination"].replace(True, "hallucinated").replace(False, "factual"), - test_dataset[f"guard_passed_{model}"].replace(True, "factual").replace(False, "hallucinated"))) - - print("Latency") - print(test_dataset[f"guard_latency_{model}"].describe()) - print("median latency") - print(test_dataset[f"guard_latency_{model}"].median()) - - if SAVE_RESULTS_PATH: - test_dataset.to_csv(SAVE_RESULTS_PATH) diff --git a/build/lib/validator/benchmark_qa_correctness_prompt.py b/build/lib/validator/benchmark_qa_correctness_prompt.py deleted file mode 100644 index f821527..0000000 --- a/build/lib/validator/benchmark_qa_correctness_prompt.py +++ /dev/null @@ -1,149 +0,0 @@ -"""Script to evaluate QA Correctness Guard on benchmark dataset. -The 2.0 version of the large-scale dataset Stanford Question Answering Dataset (SQuAD 2.0) allows -researchers to design AI models for reading comprehension tasks under challenging constraints. 
-https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1194/reports/default/15785042.pdf - -Model: gpt-4o-mini - -Guard Results - precision recall f1-score support - - correct 1.00 0.96 0.98 133 - incorrect 0.96 1.00 0.98 117 - - accuracy 0.98 250 - macro avg 0.98 0.98 0.98 250 -weighted avg 0.98 0.98 0.98 250 - -Latency -count 250.000000 -mean 2.610912 -std 1.415877 -min 1.148114 -25% 1.678278 -50% 2.263149 -75% 2.916726 -max 10.625763 -Name: guard_latency_gpt-4o-mini, dtype: float64 -median latency -2.263148645986803 - -Model: gpt-4-turbo - -Guard Results - precision recall f1-score support - - correct 1.00 0.92 0.96 133 - incorrect 0.91 1.00 0.96 117 - - accuracy 0.96 250 - macro avg 0.96 0.96 0.96 250 -weighted avg 0.96 0.96 0.96 250 - -Latency -count 250.000000 -mean 7.390556 -std 5.804535 -min 1.671949 -25% 3.544383 -50% 5.239343 -75% 8.484112 -max 30.651372 -Name: guard_latency_gpt-4-turbo, dtype: float64 -median latency -5.239343083492713 -""" -import os -import time -from getpass import getpass -from typing import List, Tuple - -import openai -import pandas as pd -from sklearn.metrics import classification_report - -from guardrails import Guard -from main import QACorrectnessPrompt, LlmRagEvaluator -from phoenix.evals import download_benchmark_dataset -from sklearn.utils import shuffle - - -RANDOM_STATE = 119 -MODELS = ["gpt-4o-mini", "gpt-4-turbo"] -N_EVAL_SAMPLE_SIZE = 250 -SAVE_RESULTS_PATH = "qa_correctness_guard_results.csv" - - -def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard, model: str) -> Tuple[List[float], List[bool]]: - """Evaluate guard on benchmark dataset. - - :param test_dataset: Dataframe of test examples. - :param guard: Guard we want to evaluate. - - :return: Tuple where the first lists contains latency, and the second list contains a boolean indicating whether the guard passed. 
- """ - latency_measurements = [] - guard_passed = [] - for _, rag_example in test_dataset.iterrows(): - start_time = time.perf_counter() - response = guard( - llm_api=openai.chat.completions.create, - prompt=rag_example["question"], - model=model, - max_tokens=1024, - temperature=0.5, - metadata={ - "user_message": rag_example["question"], - "context": rag_example["context"], - "llm_response": rag_example["sampled_answer"], - } - ) - latency_measurements.append(time.perf_counter() - start_time) - guard_passed.append(response.validation_passed) - return latency_measurements, guard_passed - - -if __name__ == "__main__": - if not (openai_api_key := os.getenv("OPENAI_API_KEY")): - openai_api_key = getpass("🔑 Enter your OpenAI API key: ") - openai.api_key = openai_api_key - os.environ["OPENAI_API_KEY"] = openai_api_key - - # Columns: Index(['id', 'title', 'context', 'question', 'answers', 'correct_answer', 'wrong_answer', 'sampled_answer', 'answer_true'] - test_dataset = df = download_benchmark_dataset( - task="qa-classification", - dataset_name="qa_generated_dataset") - test_dataset = shuffle(test_dataset, random_state=RANDOM_STATE) - test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE] - - for model in MODELS: - guard = Guard.from_string( - validators=[ - LlmRagEvaluator( - eval_llm_prompt_generator=QACorrectnessPrompt(prompt_name="qa_correctness_judge_llm"), - llm_evaluator_fail_response="incorrect", - llm_evaluator_pass_response="correct", - llm_callable=model, - on_fail="noop", - on="prompt") - ], - ) - - latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard, model=model) - test_dataset[f"guard_passed_{model}"] = guard_passed - test_dataset[f"guard_latency_{model}"] = latency_measurements - - print(f"\nModel: {model}") - print("\nGuard Results") - # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags an incorrect answer) - print(classification_report( - test_dataset["answer_true"].replace(True, "correct").replace(False, "incorrect"), - test_dataset[f"guard_passed_{model}"].replace(True, "correct").replace(False, "incorrect"))) - - print("Latency") - print(test_dataset[f"guard_latency_{model}"].describe()) - print("median latency") - print(test_dataset[f"guard_latency_{model}"].median()) - - if SAVE_RESULTS_PATH: - test_dataset.to_csv(SAVE_RESULTS_PATH)