diff --git a/.libraries-whitelist.txt b/.libraries-whitelist.txt index eb7e2f65..79551157 100644 --- a/.libraries-whitelist.txt +++ b/.libraries-whitelist.txt @@ -2,4 +2,5 @@ pkg_resources tiktoken chardet chroma-hnswlib -rouge \ No newline at end of file +rouge +distilabel \ No newline at end of file diff --git a/docs/how-to/generate_dataset.md b/docs/how-to/generate_dataset.md new file mode 100644 index 00000000..0df1034b --- /dev/null +++ b/docs/how-to/generate_dataset.md @@ -0,0 +1,214 @@ +# Generating a Dataset with Ragbits + +Ragbits offers a convenient feature to generate artificial QA datasets for evaluating Retrieval-Augmented Generation (RAG) systems. You can choose between two different approaches: + +## Available Stacks + +1. **FromScratch**: + - This option allows you to create a complete QA dataset from scratch. + - **How it works**: You provide a list of topics, and the system automatically generates both the corpus and the QA dataset. + +2. **FromCorpus**: + - This approach uses an existing textual corpus. + - **How it works**: You supply a pre-existing corpus, such as documents you’ve previously retrieved, and the system creates the QA dataset based on it. + +## Usage Examples + +Below are examples demonstrating how to use both approaches. + + +### From Scratch + + +```python +import json + +from datasets import Dataset +from omegaconf import OmegaConf +from ragbits.evaluate.dataset_generator.pipeline import DatasetGenerationPipeline + + +def print_dataset(dataset: Dataset): + entries = [] + for idx, (question, answer, passage) in enumerate( + zip(dataset["question"], dataset["basic_answer"], dataset["passages"]) + ): + entries.append( + f"{idx}. QUESTION: {question} ANSWER: {answer} PASSAGES: {json.dumps(passage)}" + ) + print("\r\n".join(entries)) + +# configuration should follow the +# ragbits.evaluate.dataset_generator.DatasetGenerationPipelineConfig data model +pipeline_config = OmegaConf.create( + { + "input_name": "query", + "pipeline": { + "name": "synthetic-RAG-data", + "tasks": [ + { + "type": "ragbits.evaluate.dataset_generator.tasks.corpus_generation:CorpusGenerationStep", + "llm": { + "provider_type": "ragbits.core.llms.litellm:LiteLLM", + "kwargs": {"model_name": "gpt-4o"}, + }, + "kwargs": { + "num_per_topic": 5, + "prompt_class": "ragbits.evaluate.dataset_generator.prompts.corpus_generation:BasicCorpusGenerationPrompt", + }, + }, + { + "type": "ragbits.evaluate.dataset_generator.tasks.text_generation.qa:QueryGenTask", + "llm": { + "provider_type": "distilabel.llms:OpenAILLM", + "kwargs": {"model": "gpt-4o"}, + }, + "kwargs": { + "prompt_class": "ragbits.evaluate.dataset_generator.prompts.qa:QueryGenPrompt" + }, + }, + { + "type": "ragbits.evaluate.dataset_generator.tasks.text_generation.qa:AnswerGenTask", + "llm": { + "provider_type": "distilabel.llms:OpenAILLM", + "kwargs": {"model": "gpt-4o"}, + }, + "kwargs": { + "prompt_class": "ragbits.evaluate.dataset_generator.prompts.qa:BasicAnswerGenPrompt" + }, + }, + { + "type": "ragbits.evaluate.dataset_generator.tasks.text_generation.qa:PassagesGenTask", + "llm": { + "provider_type": "distilabel.llms:OpenAILLM", + "kwargs": {"model": "gpt-4o"}, + }, + "kwargs": { + "prompt_class": "ragbits.evaluate.dataset_generator.prompts.qa:PassagesGenPrompt" + }, + "filters": [ + "ragbits.evaluate.dataset_generator.tasks.filter.dont_know:DontKnowFilter" + ], + }, + ], + }, + } +) + + +topics = ["conspiracy theories", "retrieval augmented generation"] +pipeline = 
DatasetGenerationPipeline.from_dict_config(dict_config=pipeline_config) +dataset = pipeline(topics) +print_dataset(dataset) +``` + +After successful execution, your console should display output with the following structure: + +```text +0. QUESTION: Is there a theory that suggests the Earth is flat? ANSWER: Yes, the "Flat Earth" theory suggests that the Earth is a flat disc rather than a sphere. PASSAGES: ["The 'Flat Earth' theory suggests that the Earth is a flat disc rather than a sphere."] +1. QUESTION: Was the 1969 moon landing really staged by NASA? ANSWER: No, the 1969 moon landing was not staged by NASA. It was a real event where astronauts from the Apollo 11 mission landed on the moon. The conspiracy theory claiming it was staged is false. PASSAGES: ["The moon landing conspiracy theory falsely claims the 1969 moon landing was staged by NASA."] +2. QUESTION: Is the Earth really flat instead of round? ANSWER: No, the Earth is not flat. Scientific evidence overwhelmingly supports that Earth is an oblate spheroid, which means it is mostly spherical but slightly flattened at the poles and bulging at the equator. PASSAGES: ["scientific evidence overwhelmingly supports that Earth is an oblate spheroid, which means it is mostly spherical but slightly flattened at the poles and bulging at the equator"] +3. QUESTION: Who claims the moon landing was staged in 1969? ANSWER: The moon landing conspiracy theory claims it was staged by NASA in 1969. PASSAGES: ["The moon landing conspiracy theory claims it was staged by NASA in 1969."] +4. QUESTION: How does retrieval augmented generation improve accuracy? ANSWER: Retrieval augmented generation improves accuracy by combining pretrained language models with a retrieval component, allowing the model to access and incorporate relevant information from external data sources during the generation process. PASSAGES: ["Retrieval augmented generation (RAG) combines pretrained language models with a retrieval component to enhance accuracy."] +5. QUESTION: How does retrieval-augmented generation improve response accuracy and relevancy? ANSWER: Retrieval-augmented generation improves response accuracy and relevancy by combining retrieved information with language models. This approach allows the model to incorporate relevant data from external sources, which enhances its ability to generate more accurate and contextually appropriate responses. PASSAGES: ["Retrieval-augmented generation combines retrieved information with language models to improve response accuracy and relevancy."] +6. QUESTION: How does retrieval-augmented generation work to improve response accuracy? ANSWER: Retrieval-augmented generation improves response accuracy by combining information retrieval with text generation. This approach involves retrieving relevant information from a database or other sources and using that information to generate more accurate and informed responses. PASSAGES: ["Retrieval-augmented generation combines information retrieval with text generation to enhance response accuracy."] +7. QUESTION: How does retrieval augmented generation work? ANSWER: Retrieval augmented generation works by combining language models with an external information retrieval system. This approach allows the model to access and incorporate relevant data from an external source, enhancing the generation of responses or content with up-to-date or specific information it might not have inherently. 
PASSAGES: ["Retrieval augmented generation combines language models with external information retrieval."] +8. QUESTION: How does retrieval-augmented generation improve AI responses? ANSWER: Retrieval-augmented generation improves AI responses by combining the retrieval of relevant documents with text generation, providing enhanced context for the responses. PASSAGES: ["retrieval of relevant documents", "text generation for improved context"] +``` + +Please note that the results may differ between runs due to the non-deterministic nature of LLMs. + + +### From Corpus + +The code is very similar to the previous example - the only differences are: + +* removing the first task from the tasks list in the pipeline config +* changing the input name from `query` to `chunk` + + +```python +import json + +from datasets import Dataset +from omegaconf import OmegaConf +from ragbits.evaluate.dataset_generator.pipeline import DatasetGenerationPipeline + + +# configuration should follow the +# ragbits.evaluate.dataset_generator.DatasetGenerationPipelineConfig data model +pipeline_config = OmegaConf.create( + { + "input_name": "chunk", + "pipeline": { + "name": "synthetic-RAG-data", + "tasks": [ + { + "type": "ragbits.evaluate.dataset_generator.tasks.text_generation.qa:QueryGenTask", + "llm": { + "provider_type": "distilabel.llms:OpenAILLM", + "kwargs": {"model": "gpt-4o"}, + }, + "kwargs": { + "prompt_class": "ragbits.evaluate.dataset_generator.prompts.qa:QueryGenPrompt" + }, + }, + { + "type": "ragbits.evaluate.dataset_generator.tasks.text_generation.qa:AnswerGenTask", + "llm": { + "provider_type": "distilabel.llms:OpenAILLM", + "kwargs": {"model": "gpt-4o"}, + }, + "kwargs": { + "prompt_class": "ragbits.evaluate.dataset_generator.prompts.qa:BasicAnswerGenPrompt" + }, + }, + { + "type": "ragbits.evaluate.dataset_generator.tasks.text_generation.qa:PassagesGenTask", + "llm": { + "provider_type": "distilabel.llms:OpenAILLM", + "kwargs": {"model": "gpt-4o"}, + }, + "kwargs": { + "prompt_class": "ragbits.evaluate.dataset_generator.prompts.qa:PassagesGenPrompt" + }, + "filters": [ + "ragbits.evaluate.dataset_generator.tasks.filter.dont_know:DontKnowFilter" + ], + }, + ], + }, + } +) + + +def print_dataset(dataset: Dataset): + entries = [] + for idx, (question, answer, passage) in enumerate( + zip(dataset["question"], dataset["basic_answer"], dataset["passages"]) + ): + entries.append( + f"{idx}. QUESTION: {question} ANSWER: {answer} PASSAGES: {json.dumps(passage)}" + ) + print("\r\n".join(entries)) + + +topics = [ + "Neural networks are algorithms capable of data structure recognition", + "Large Language Models (LLM) are trained to predict the term given the context", + "Logistic regression is a simpliest form of neural network with no hidden neurons and output activated with sigmoid function", +] +pipeline = DatasetGenerationPipeline.from_dict_config(dict_config=pipeline_config) +dataset = pipeline(topics) +print_dataset(dataset) +``` + +After successful execution, you should see the following output, subject to the considerations mentioned in the [From Scratch](#from-scratch) section: + +```text +0. QUESTION: What are neural networks capable of? ANSWER: Neural networks are capable of data structure recognition. PASSAGES: ["Neural networks are algorithms capable of data structure recognition"] +1. QUESTION: What does LLM stand for? ANSWER: LLM stands for Large Language Models. PASSAGES: ["Large Language Models (LLM)"] +2. QUESTION: What's the simplest form of a neural network? 
ANSWER: Logistic regression is the simplest form of a neural network, with no hidden neurons and an output activated with a sigmoid function. PASSAGES: ["Logistic regression is a simpliest form of neural network with no hidden neurons and output activated with sigmoid function"] +``` + + diff --git a/examples/evaluation/dataset-generator/config/generate.yaml b/examples/evaluation/dataset-generator/config/generate.yaml new file mode 100644 index 00000000..b24b9413 --- /dev/null +++ b/examples/evaluation/dataset-generator/config/generate.yaml @@ -0,0 +1,34 @@ +input_name: query +name: synthetic-RAG-data +tasks: + - type: ragbits.evaluate.dataset_generator.tasks.corpus_generation:CorpusGenerationStep + llm: + provider_type: ragbits.core.llms.litellm:LiteLLM + kwargs: + model_name: gpt-4o + kwargs: + num_per_topic: 5 + prompt_class: ragbits.evaluate.dataset_generator.prompts.corpus_generation:BasicCorpusGenerationPrompt + - type: ragbits.evaluate.dataset_generator.tasks.text_generation.qa:QueryGenTask + llm: + provider_type: distilabel.llms:OpenAILLM + kwargs: + model: gpt-4o + kwargs: + prompt_class: ragbits.evaluate.dataset_generator.prompts.qa:QueryGenPrompt + - type: ragbits.evaluate.dataset_generator.tasks.text_generation.qa:AnswerGenTask + llm: + provider_type: distilabel.llms:OpenAILLM + kwargs: + model: gpt-4o + kwargs: + prompt_class: ragbits.evaluate.dataset_generator.prompts.qa:BasicAnswerGenPrompt + - type: ragbits.evaluate.dataset_generator.tasks.text_generation.qa:PassagesGenTask + llm: + provider_type: distilabel.llms:OpenAILLM + kwargs: + model: gpt-4o + kwargs: + prompt_class: ragbits.evaluate.dataset_generator.prompts.qa:PassagesGenPrompt + filters: + - ragbits.evaluate.dataset_generator.tasks.filter.dont_know:DontKnowFilter diff --git a/examples/evaluation/dataset-generator/generate.py b/examples/evaluation/dataset-generator/generate.py new file mode 100644 index 00000000..9eae9e46 --- /dev/null +++ b/examples/evaluation/dataset-generator/generate.py @@ -0,0 +1,25 @@ +import hydra +from omegaconf import DictConfig + +from ragbits.evaluate.dataset_generator.pipeline import DatasetGenerationPipeline +from ragbits.evaluate.utils import log_dataset_to_file + + +@hydra.main(config_path="config", config_name="generate", version_base="3.2") +def main(config: DictConfig) -> None: + """ + A main function for dataset generation example + Args: + config (DictConfig) - configuration should follow + ragbits.evaluate.dataset_generator.DatasetGenerationPipelineConfig data model + Returns: + None + """ + TOPICS = ["conspiracy theories", "machine learning"] + generation_pipeline = DatasetGenerationPipeline.from_dict_config(dict_config=config) + result_dataset = generation_pipeline(corpus=TOPICS) + log_dataset_to_file(dataset=result_dataset) + + +if __name__ == "__main__": + main() diff --git a/packages/ragbits-evaluate/pyproject.toml b/packages/ragbits-evaluate/pyproject.toml index 3d26595e..92840c83 100644 --- a/packages/ragbits-evaluate/pyproject.toml +++ b/packages/ragbits-evaluate/pyproject.toml @@ -31,7 +31,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Software Development :: Libraries :: Python Modules", ] -dependencies = ["hydra-core~=1.3.2", "neptune~=1.12.0", "ragbits-core==0.3.0", "optuna==4.0.0"] +dependencies = ["hydra-core~=1.3.2", "neptune~=1.12.0", "ragbits-core==0.3.0", "optuna==4.0.0", "distilabel==1.4.1"] [project.optional-dependencies] relari = [ diff --git 
a/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/__init__.py b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/pipeline.py b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/pipeline.py new file mode 100644 index 00000000..bb95e151 --- /dev/null +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/pipeline.py @@ -0,0 +1,141 @@ +import sys +from typing import Any + +from datasets import Dataset +from distilabel.pipeline import Pipeline +from distilabel.steps.base import Step +from omegaconf import DictConfig, OmegaConf +from pydantic import BaseModel + +from ragbits.core.utils.config_handling import get_cls_from_config + +module = sys.modules[__name__] + + +class LLMConfigForTask(BaseModel): + """ + Configuration for the LLM (Language Model) associated with a specific task. + + Attributes: + provider_type (str): The type of LLM provider. + kwargs (dict): Additional parameters or settings for the LLM provider. + """ + + provider_type: str + kwargs: dict + + +class TaskConfig(BaseModel): + """ + Configuration for an individual task in the dataset generation pipeline. + + Attributes: + type: str: type of the task + llm (LLMConfigForTask): The configuration for the LLM used in this task. + kwargs (dicts): Optional additional parameters or settings for the task. + filters (list[str] | None): Optional filters to apply during the task. Defaults to None. + """ + + type: str + llm: LLMConfigForTask + kwargs: dict | None = None + filters: list[str] | None = None + + +class DatasetGenerationPipelineConfig(BaseModel): + """ + Configuration for the entire dataset generation pipeline. + + Attributes: + name (str): The name of the dataset generation pipeline. + input_name (str): The name of the input resource or dataset. + tasks (list[TaskConfig]): A list of task configurations included in the pipeline. + """ + + name: str + input_name: str + tasks: list[TaskConfig] + + @classmethod + def from_dict_config(cls, dict_config: DictConfig) -> "DatasetGenerationPipelineConfig": + """ + Creates an instance of `DatasetGenerationPipelineConfig` from a dictionary-based configuration. + + Args: + dict_config (DictConfig): A configuration object containing pipeline details. + + Returns: + DatasetGenerationPipelineConfig: An instance populated with data from the given configuration. 
+ + """ + name = dict_config.name + input_name = dict_config.input_name + tasks = [ + TaskConfig( + type=task_config.type, + llm=LLMConfigForTask( + provider_type=task_config.llm.provider_type, + kwargs=OmegaConf.to_container(task_config.llm.kwargs), # type: ignore + ), + kwargs=OmegaConf.to_container(task_config.kwargs), # type: ignore + filters=getattr(task_config, "filters", None), + ) + for task_config in dict_config.tasks + ] + return cls(name=name, input_name=input_name, tasks=tasks) + + +class DatasetGenerationPipeline: + """A pipeline for dataset generation""" + + def __init__(self, config: DatasetGenerationPipelineConfig): + self.config = config + self._instantiate_pipeline() + + @classmethod + def from_dict_config(cls, dict_config: DictConfig) -> "DatasetGenerationPipeline": + """ + Instantiates the pipeline from dict config validated through pydantic base model + Returns: + DatasetGenerationPipeline + """ + config = DatasetGenerationPipelineConfig.from_dict_config(dict_config=dict_config) + return cls(config=config) + + def __call__(self, corpus: list[str]) -> Dataset: + """ + Generates a dataset from a corpus or list of topics + Args: + corpus: a corpus of information or list of topics + Returns: + dataset instance + """ + dataset = Dataset.from_dict({self.config.input_name: corpus}) + distiset = self.pipeline.run(use_cache=False, dataset=dataset) + result = distiset["default"]["train"] + result = result.remove_columns(["distilabel_metadata", "model_name"]) + return result + + def _parse_pipeline_steps(self) -> list[Step]: + tasks = [] + for task_config in self.config.tasks: + llm_config = task_config.llm + llm = get_cls_from_config(llm_config.provider_type, module)(**llm_config.kwargs) + task_kwargs: dict[Any, Any] = {"llm": llm} + task_kwargs.update(task_config.kwargs or {}) # type: ignore + task = get_cls_from_config(task_config.type, module)(**task_kwargs) + tasks.append(task) + filter_types = getattr(task_config, "filters", None) or [] + for filter_type in filter_types: + filter = get_cls_from_config(filter_type, module)(tasks[-1]) + tasks.append(filter) + return tasks + + def _instantiate_pipeline(self) -> None: + with Pipeline(self.config.name) as self.pipeline: + tasks = self._parse_pipeline_steps() + prev_task = None + for task in tasks: + if prev_task: + prev_task >> task + prev_task = task diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/prompts/__init__.py b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/prompts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py new file mode 100644 index 00000000..3c6aa281 --- /dev/null +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py @@ -0,0 +1,21 @@ +from pydantic import BaseModel + +from ragbits.core.prompt import Prompt + + +class BasicCorpusGenerationPromptInput(BaseModel): + """A definition of input for corpus generation task""" + + query: str + + +class BasicCorpusGenerationPrompt(Prompt[BasicCorpusGenerationPromptInput]): + """A basic prompt for corpus generation""" + + system_prompt: str = ( + "You are a provider of random factoids on topic requested by a user." 
+ "Do not write a long essays, the response for given query should be a single sentence" + "For each query provide only a single fact about a given topic" + "Use as few tokens as possible" + ) + user_prompt: str = "Provide factoids about {{ query }}" diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/prompts/qa.py b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/prompts/qa.py new file mode 100644 index 00000000..40e29078 --- /dev/null +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/prompts/qa.py @@ -0,0 +1,85 @@ +from pydantic import BaseModel + +from ragbits.core.prompt import Prompt + + +class BasicAnswerGenInput(BaseModel): + """An input definition for basic answer generation task""" + + chunk: str + question: str + + +class BasicAnswerGenPrompt(Prompt[BasicAnswerGenInput, str]): + """A prompt clas for basic answers generation""" + + system_prompt: str = ( + "You are an AI assistant to answer the given question in the provided " + "evidence text. Do not mention any of these in the answer: 'in the " + "given text', 'in the provided information', etc. Users do not know " + "the passage source of the answer, so it should not be mentioned in " + "the answer. You can find the evidence from the given text about the " + "question, and you have to write a proper answer to the given question. " + "If you don't know the answer just say: I don't know." + ) + + user_prompt: str = "Text:\n<|text_start|>\n {{ chunk }} \n<|text_end|>\n\nQuestion:\n " "{{ question }} \n\nAnswer:" + + +class PassagesGenInput(BaseModel): + """An input definition to passage generation prompt""" + + question: str + basic_answer: str + chunk: str + + +class PassagesGenPrompt(Prompt[PassagesGenInput, str]): + """A prompt class for passages generation""" + + system_prompt: str = ( + "You are an AI tasked with retrieving passages (one or many) from the " + "provided Chunk that contain information needed to generate the " + "provided Answer to the given Question.\n\nInstructions:\n1. Each " + "Passage MUST be VERBATIM and EXACT, without any modifications\n2. " + "Please provide the response in the form of a Python list. It should " + "begin with '[' and end with ']'\n3. You MUST start your answer with " + "'['\n4. The Chunk ALWAYS contains information needed to justify the " + "Answer\n5. Each passage must be as BRIEF as possible; DO NOT RETURN " + "FULL SENTENCES" + ) + + user_prompt: str = ( + "Question:\n {{ question }} \nAnswer:\n {{ basic_answer }} \nChunk:\n " "{{ chunk }}\n\nPassages:" + ) + + +class QueryGenInput(BaseModel): + """An input definition for query generation prompt""" + + chunk: str + + +class QueryGenPrompt(Prompt[QueryGenInput, str]): + """A prompt class for query generation""" + + system_prompt: str = ( + "You're an AI tasked to convert Text into a factoid question. Factoid " + "questions are those seeking brief, factual information that can be " + "easily verified. They typically require a yes or no answer or a brief " + "explanation and often inquire about specific details such as dates, " + "names, places, or events.\n\nExamples of factoid questions include:\n" + "- What is the incoming shipment report?\n- What angle should I set my " + "ladder at?\n- What documents do I need to be a proof of transaction?\n\n" + "Instructions:\n1. Questions MUST BE extracted from given Text\n2. " + "Questions MUST BE as SHORT as possible\n3. Questions should be as " + "detailed as possible from Text\n4. 
Create questions that ask about " + "factual information from the Text\n5. Only return ONE question\n6. " + "Frame questions in a first-person, INFORMAL style, as if the employee " + "is seeking advice or clarification while working\n7. Do not mention any " + "of these in the questions: 'in the given text', 'in the provided " + "information', etc. Users do not know the passage source of the question, " + "so it should not be mentioned in the question." + ) + + user_prompt: str = "Text: {{ chunk }}\n\nGenerated Question from the Text:\n" diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/__init__.py b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py new file mode 100644 index 00000000..9fe203d1 --- /dev/null +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py @@ -0,0 +1,69 @@ +import asyncio +import sys +from copy import deepcopy + +from distilabel.steps import StepInput, StepOutput +from distilabel.steps.base import Step + +from ragbits.core.llms.base import LLM +from ragbits.core.prompt import Prompt +from ragbits.core.utils.config_handling import get_cls_from_config + +module = sys.modules[__name__] + + +class CorpusGenerationStep(Step): + """A step for corpus generation on given topics""" + + def __init__( + self, + llm: LLM, + num_per_topic: int, + prompt_class: str | type[Prompt], + ): + super().__init__() + self._llm = llm + self._prompt_class = ( + get_cls_from_config(prompt_class, module) if isinstance(prompt_class, str) else prompt_class + ) + self._num_per_topic = num_per_topic + + @property + def inputs(self) -> list[str]: + """ + A property defining input fields for a task + Returns: + list of input fields + """ + return ["query"] + + @property + def outputs(self) -> list[str]: + """ + A property describing output fields for a step + Returns: + list of output fields + """ + return ["chunk"] + + def process(self, *inputs: StepInput) -> "StepOutput": + """ + Generates the corpus data for a given topics + Args: + inputs: a topics on which the corpus data should be generated + Returns: + a generated corpus + """ + result = asyncio.get_event_loop().run_until_complete(self._process_topics(topics=inputs[0])) + yield result + + async def _process_topics(self, topics: list[dict]) -> list[dict]: + tasks = [self._process_topic(topic) for _ in range(self._num_per_topic) for topic in topics] + results = await asyncio.gather(*tasks) + return results + + async def _process_topic(self, topic: dict) -> dict: + new_inp = deepcopy(topic) + prompt_inp = self._prompt_class.input_type(**{self.inputs[0]: new_inp[self.inputs[0]]}) # type: ignore + new_inp[self.outputs[0]] = await self._llm.generate(prompt=self._prompt_class(prompt_inp)) + return new_inp diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py new file mode 100644 index 00000000..51ed892a --- /dev/null +++ 
b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py @@ -0,0 +1,43 @@ +from abc import ABC, abstractmethod + +from distilabel.steps import Step, StepInput, StepOutput + +from ..corpus_generation import CorpusGenerationStep +from ..text_generation.base import BaseDistilabelTask + + +class BaseFilter(Step, ABC): + """Base class for filtering the outputs of pipeline steps""" + + def __init__(self, task: BaseDistilabelTask | CorpusGenerationStep): + super().__init__() + self._task = task + + @property + def inputs(self) -> list[str]: + """ + Property describing input fields for a filter + Returns: + list of input fields for a filter + """ + return self._task.outputs + + @property + def outputs(self) -> list[str]: + """ + Property describing output fields for a filter + Returns: + list of output fields for a filter + """ + return self._task.outputs + + @abstractmethod + def process(self, *inputs: StepInput) -> "StepOutput": + """ + Abstract method for filter step processing + Args: + inputs - inputs to a filter + Returns: + filtered outputs + """ + pass diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py new file mode 100644 index 00000000..da473ac4 --- /dev/null +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py @@ -0,0 +1,34 @@ +from typing import Any + +from distilabel.steps import StepInput, StepOutput + +from .base import BaseFilter + +DONT_KNOW_PHRASES: list[str] = [ + "I don't know", + "I do not know", + "don't know", +] + + +class DontKnowFilter(BaseFilter): + """A class for basic rule-based filtering of don't know anwers""" + + def process(self, *inputs: StepInput) -> "StepOutput": + """ + Runs the basic rule-based filtering of the inputs + Args: + inputs - the outputs of some generation step + Returns: + outputs filtered to the ones that do not contain the pre-defined phrases + """ + result = [ + {input_type: input_[input_type] for input_type in input_} + for input_ in inputs[0] + if not self._is_dont_know(input_) + ] + yield result + + @staticmethod + def _is_dont_know(input_: dict[str, Any]) -> bool: + return any(s.lower() in input_["basic_answer"].lower() for s in DONT_KNOW_PHRASES) diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py new file mode 100644 index 00000000..ba9a7154 --- /dev/null +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py @@ -0,0 +1,68 @@ +import sys +from abc import ABC, abstractmethod +from typing import Any + +from distilabel.llms.base import LLM +from distilabel.steps.tasks import TextGeneration + +from ragbits.core.prompt import ChatFormat, Prompt +from ragbits.core.utils.config_handling import get_cls_from_config + +module = sys.modules[__name__] + + +class BaseDistilabelTask(TextGeneration, ABC): + """Base class for distilabel TextGeneration tasks""" + + def __init__(self, llm: LLM, inputs: list[str], outputs: list[str], prompt_class: str | type[Prompt]): + 
super().__init__(llm=llm) + self._inputs = inputs + self._outputs = outputs + self._prompt_class = ( + get_cls_from_config(prompt_class, module) if isinstance(prompt_class, str) else prompt_class + ) + + @property + def inputs(self) -> list[str]: + """ + Property describing input fields for a task + Returns: + list of input fields for a task + """ + return self._inputs + + @property + def outputs(self) -> list[str]: + """ + Property describing output fields of the task + Returns: + list of outputs for a task + """ + return self._outputs + + def format_input(self, input: dict[str, Any]) -> ChatFormat: + """ + Formats the input data for generating a question based on the provided "chunk". + + Args: + input: A dictionary containing a single "chunk" key with the text input. + + Returns: + The formatted chat object containing the input for query generation. + """ + chat = self._prompt_class(self._prompt_class.input_type(**input)).chat # type: ignore + return chat + + @abstractmethod + def format_output(self, output: str, input: dict[str, Any] | None = None) -> dict[str, str | list[str]]: + """ + Formats the generated question into a structured dictionary with the original "chunk" input. + + Args: + output: The generated question. + input: Optional; contains "chunk" key with the original input chunk. + + Returns: + A dictionary containing "chunk" and "question". + """ + pass diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py new file mode 100644 index 00000000..219545d0 --- /dev/null +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py @@ -0,0 +1,96 @@ +from typing import Any + +from distilabel.llms.base import LLM + +from ...utils import get_closest_substring, get_passages_list +from .base import BaseDistilabelTask + + +class QueryGenTask(BaseDistilabelTask): + """ + A task for generating a question based on a provided text chunk. + """ + + def __init__(self, llm: LLM, prompt_class: str): + super().__init__(llm=llm, inputs=["chunk"], outputs=["question", "chunk"], prompt_class=prompt_class) + + def format_output(self, output: str, input: dict[str, Any] | None = None) -> dict[str, str | list[str]]: # noqa: PLR6301 + """ + Formats the generated question into a structured dictionary with the original "chunk" input. + + Args: + output: The generated question. + input: Optional; contains "chunk" key with the original input chunk. + + Returns: + A dictionary containing "chunk" and "question". + """ + return {"chunk": input["chunk"], "question": output} # type: ignore + + +class PassagesGenTask(BaseDistilabelTask): + """ + A task for generating passages related to a specific question and answer from a text chunk. + """ + + should_get_matches: bool = False + + def __init__(self, llm: LLM, prompt_class: str): + super().__init__( + llm=llm, + inputs=["chunk", "question", "basic_answer"], + outputs=["question", "chunk", "passages"], + prompt_class=prompt_class, + ) + + def format_output(self, output: str, input: dict[str, Any] | None = None) -> dict[str, str | list[str]]: + """ + Formats the model's output into a structured dictionary with "question", "chunk", and "passages". + If `get_matches` is `True`, attempts to find the closest matches for each passage within the + provided chunk. + + Args: + output: The raw output generated by the text generation model. 
+ input: Required if `get_matches` is `True`, containing "chunk" and "question". + + Returns: + A dictionary with "chunk", "question", and a list of "passages". + """ + passages: list[str] = get_passages_list(output) or [] + + if self.should_get_matches: + matched_passages: list[str] = [] + + for passage in passages: + if passage in input["chunk"]: # type: ignore + matched_passages.append(passage) + else: + matched_passage = get_closest_substring(input["chunk"], passage) # type: ignore + matched_passages.append(matched_passage) + + return {"chunk": input["chunk"], "question": input["question"], "passages": matched_passages} # type: ignore + + return {"chunk": input["chunk"], "question": input["question"], "passages": passages} # type: ignore + + +class AnswerGenTask(BaseDistilabelTask): + """ + A task for generating basic answers to questions based on a provided text chunk. This class extends + the `TextGeneration` task from the `distilabel` package. + """ + + def __init__(self, llm: LLM, prompt_class: str): + super().__init__(llm=llm, inputs=["chunk", "question"], outputs=["basic_answer"], prompt_class=prompt_class) + + def format_output(self, output: str, input: dict[str, Any] | None = None) -> dict[str, str | list[str]]: # noqa: PLR6301 + """ + Formats the model's output into a structured dictionary with the "basic_answer" key. + + Args: + output: The raw output generated by the text generation model. + input: Optional; not typically used in this formatting. + + Returns: + A dictionary with "basic_answer" as the key and the generated output as its value. + """ + return {"basic_answer": output} diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/utils.py b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/utils.py new file mode 100644 index 00000000..19e19e2b --- /dev/null +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/dataset_generator/utils.py @@ -0,0 +1,43 @@ +import json +import re +import warnings +from difflib import SequenceMatcher +from itertools import combinations + + +def get_closest_substring(long: str, short: str) -> str: + """ + Finds the closest substring to short string in longer one + Args: + long: str - longer string + short: str - shorter string + Returns: + closest substring of longer + """ + a, b = max( + combinations(re.finditer("|".join(short.split()), long), 2), + key=lambda c: SequenceMatcher(None, long[c[0].start() : c[1].end()], short).ratio(), + ) + return long[a.start() : b.end()] + + +def get_passages_list(raw_passages: str) -> list[str]: + """ + Formats LLM output to list of passages + Args: + raw_passages: string representing raw passages returned by llm + Returns: + list of parsed passages + """ + match = re.search(r"\[(.*?)\]", raw_passages, re.DOTALL) + + if match: + passages_content = match.group(1) + try: + return json.loads("[" + passages_content + "]") + except (SyntaxError, ValueError): + warnings.warn("Unable to evaluate the passages content. 
Check the format.", category=UserWarning) + return [] + else: + warnings.warn(message="No brackets found in the input string.", category=UserWarning) + return [] diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/utils.py b/packages/ragbits-evaluate/src/ragbits/evaluate/utils.py index 2b801424..f2ea8fb8 100644 --- a/packages/ragbits-evaluate/src/ragbits/evaluate/utils.py +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/utils.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Any +from datasets import Dataset from hydra.core.hydra_config import HydraConfig from neptune import Run from neptune.utils import stringify_unsupported @@ -49,6 +50,23 @@ def log_to_file(results: dict[str, Any], output_dir: Path | None = None) -> Path return output_dir +def log_dataset_to_file(dataset: Dataset, output_dir: Path | None = None) -> Path: + """ + Log the evaluation results locally. + + Args: + dataset: Huggingface dataset to be logged. + output_dir: The output directory. + + Returns: + The output directory. + """ + output_dir = output_dir or Path(HydraConfig.get().runtime.output_dir) + dataset_file = output_dir / "dataset.hf" + dataset.save_to_disk(dataset_path=str(dataset_file)) + return output_dir + + def log_optimization_to_file( results: list[tuple[DictConfig, float, dict[str, float]]], output_dir: Path | None = None ) -> Path: diff --git a/uv.lock b/uv.lock index c89e5221..0d7e6720 100644 --- a/uv.lock +++ b/uv.lock @@ -871,6 +871,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c9/7a/cef76fd8438a42f96db64ddaa85280485a9c395e7df3db8158cfec1eee34/dill-0.3.8-py3-none-any.whl", hash = "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7", size = 116252 }, ] +[[package]] +name = "distilabel" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "datasets" }, + { name = "httpx" }, + { name = "jinja2" }, + { name = "multiprocess" }, + { name = "nest-asyncio" }, + { name = "networkx" }, + { name = "orjson" }, + { name = "portalocker" }, + { name = "pydantic" }, + { name = "rich" }, + { name = "scipy" }, + { name = "tblib" }, + { name = "typer" }, + { name = "universal-pathlib" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2e/1b/331aeeda851a888e8bff84b8074cb1301909b06e509140a85a23dd1345cf/distilabel-1.4.1.tar.gz", hash = "sha256:0c373be234e8f2982ec7f940d9a95585b15306b6ab5315f5a6a45214d8f34006", size = 6420123 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/b3/62d07a936cd9c3039d811681c33b9fc898e48219cf22c9186954e2575365/distilabel-1.4.1-py3-none-any.whl", hash = "sha256:4643da7f3abae86a330d86d1498443ea56978e462e21ae3d106a4c6013386965", size = 442152 }, +] + [[package]] name = "distlib" version = "0.3.8" @@ -2686,7 +2711,7 @@ name = "nvidia-cudnn-cu12" version = "8.9.2.26" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, + { name = "nvidia-cublas-cu12" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/ff/74/a2e2be7fb83aaedec84f391f082cf765dfb635e7caa9b49065f73e4835d8/nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl", hash = "sha256:5ccb288774fdfb07a7e7025ffec286971c06d8d7b4fb162525334616d7629ff9", size = 731725872 }, @@ -2713,9 +2738,9 @@ name = "nvidia-cusolver-cu12" version = "11.4.5.107" source = { registry = "https://pypi.org/simple" } dependencies = 
[ - { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, - { name = "nvidia-cusparse-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, - { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, + { name = "nvidia-cublas-cu12" }, + { name = "nvidia-cusparse-cu12" }, + { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/bc/1d/8de1e5c67099015c834315e333911273a8c6aaba78923dd1d1e25fc5f217/nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl", hash = "sha256:8a7ec542f0412294b15072fa7dab71d31334014a69f953004ea7a118206fe0dd", size = 124161928 }, @@ -2726,7 +2751,7 @@ name = "nvidia-cusparse-cu12" version = "12.1.0.106" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, + { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/65/5b/cfaeebf25cd9fdec14338ccb16f6b2c4c7fa9163aefcf057d86b9cc248bb/nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:f3b50f42cf363f86ab21f720998517a659a48131e8d538dc02f8768237bd884c", size = 195958278 }, @@ -3918,6 +3943,7 @@ name = "ragbits-evaluate" version = "0.3.0" source = { editable = "packages/ragbits-evaluate" } dependencies = [ + { name = "distilabel" }, { name = "hydra-core" }, { name = "neptune" }, { name = "optuna" }, @@ -3941,6 +3967,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "continuous-eval", marker = "extra == 'relari'", specifier = "~=0.3.12" }, + { name = "distilabel", specifier = "==1.4.1" }, { name = "hydra-core", specifier = "~=1.3.2" }, { name = "neptune", specifier = "~=1.12.0" }, { name = "optuna", specifier = "==4.0.0" }, @@ -4803,6 +4830,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252 }, ] +[[package]] +name = "tblib" +version = "3.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1a/df/4f2cd7eaa6d41a7994d46527349569d46e34d9cdd07590b5c5b0dcf53de3/tblib-3.0.0.tar.gz", hash = "sha256:93622790a0a29e04f0346458face1e144dc4d32f493714c6c3dff82a4adb77e6", size = 30616 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/87/ce70db7cae60e67851eb94e1a2127d4abb573d3866d2efd302ceb0d4d2a5/tblib-3.0.0-py3-none-any.whl", hash = "sha256:80a6c77e59b55e83911e1e607c649836a69c103963c5f28a46cbeef44acf8129", size = 12478 }, +] + [[package]] name = "tenacity" version = "8.5.0" @@ -5018,7 +5054,7 @@ name = "triton" version = "2.2.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock", marker = "(python_full_version < '3.12' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version < '3.12' and platform_system != 'Darwin' and platform_system != 'Linux')" }, + { name = "filelock", marker = "python_full_version < '3.12'" }, ] wheels = [ { url = 
"https://files.pythonhosted.org/packages/95/05/ed974ce87fe8c8843855daa2136b3409ee1c126707ab54a8b72815c08b49/triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2294514340cfe4e8f4f9e5c66c702744c4a117d25e618bd08469d0bfed1e2e5", size = 167900779 }, @@ -5090,6 +5126,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a6/ab/7e5f53c3b9d14972843a647d8d7a853969a58aecc7559cb3267302c94774/tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd", size = 346586 }, ] +[[package]] +name = "universal-pathlib" +version = "0.2.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fsspec" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/24/67/6c31ba464eafda05c677628dd7859ed4904597a78694d9cc81b593c6bad2/universal_pathlib-0.2.5.tar.gz", hash = "sha256:ea5d4fb8178c2ab469cf4fa46d0ceb16ccb378da46dbbc28a8b9c1eebdccc655", size = 174755 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/d9/289d308f889aac33639703a60906e3a0f3ec97419b7ca5bedaddc77648fd/universal_pathlib-0.2.5-py3-none-any.whl", hash = "sha256:a634f700eca827b4ad03bfa0267e51161560dd1de83b051cf0fccf39b3e56b32", size = 49892 }, +] + [[package]] name = "unstructured" version = "0.15.13"