Commit

feat: add data synthesis pipe

harshraj172 committed Sep 13, 2023
0 parents commit fc67468

Showing 151 changed files with 22,557 additions and 0 deletions.
18 changes: 18 additions & 0 deletions data_synthesis/README.md
@@ -0,0 +1,18 @@
## Data Synthesis

### Setup
```
conda create -n datasyn python=3.10
conda activate datasyn
pip install -r requirements.txt
```

### Run

The configuration has been moved to `config.yaml`.

Once you have the desired configuration, run:
```
python run.py --config_path config.yaml
```

Empty file added data_synthesis/__init__.py
Empty file.
108 changes: 108 additions & 0 deletions data_synthesis/config.yaml
@@ -0,0 +1,108 @@
# The model to use in generation. Available models: https://platform.openai.com/docs/models/continuous-model-upgrades
model: "gpt-4"

# OpenAI API key (if null, uses environment variable OPENAI_API_KEY)
openai_api_key:

# Optionally configure the OpenAI organization ID.
organization_id:

# Combined output file path.
output_path: instructions.jsonl

# Path to the default topics file.
topics_path: topics.txt

# Overwrite the output file, use with care!
overwrite: false

# Append to the output file.
append: true

# Embedding model, for calculating similarity between documents; probably best left as-is since the code is fairly specific to this one.
embedding_model: thenlper/gte-small
embedding_device: cpu
# If you have a GPU, set this to "cuda", e.g.:
# embedding_device: cuda

# Topic avoidance prompt string.
topic_avoidance: Avoid any tasks that would be related to climate change, green tech, renewable energy, DEI (diversity, equity, inclusion), sex and/or gender, religion, politics, social issues, race, ethnicity, artificial intelligence, baking/cooking, urban development, or any topic that you would likely not respond to, or any task which a language model would not be able to respond to, e.g. tasks about emotions, feelings, physical senses, etc.

# Regexes used to filter responses, mostly common words and phrases used in refusals.
response_filters:
- "my programming"
- "openai"
- "language model"
- "large language"
- "as an? (ai|generative language|gpt|bot)"
- "illegal and dangerous"
- "i do(n't| not) (possess|have|exhibit) (personal|consciousness|subjective)"
- "personal (feelings|thoughts|emotions|desires|experiences|goals|objective|belief)"
- "(can('t| ?not)|w(on't|will not)|unable.?) (\\w+\\s)+(with (that|your)|your \\w+|provide)"
- "my limitations"
- "the limitations of my"
- "my abilities"
- "violates my"
- "i (can('t| ?not)|w(on't|will not)|am (not |un)able.?).{0,30}(you are|you're|your )"
- "please note that"
- "flesch"

# Optionally limit the maximum number of tokens to use when generating instructions.
max_tokens:

# Minimum similarity score when checking for duplicates.
min_docsearch_score: 0.07

# Default OpenAI API request parameters.
api_params:
temperature: 0.7
top_p: 0.5
frequency_penalty: 0.0
  presence_penalty: 2.0

# Topic generation prompt.
topic_prompt: Give me a numbered list of 20 completely random topics. {topic_avoidance}
topic_request_count: 20

# Default count per generator, if not specified.
default_count: 100

# Default batch size, if not specified.
default_batch_size: 10

# Default readability score hint: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
default_flesch: The output should be written in such a way as to have a Flesch-Kincaid readability score of 30 or lower - best understood by those with college education. The response must not contain any notes or information about Flesch-Kincaid scores.

# Language.
language: English

# Individual instructor configurations.
instructors:

##################################################################################
# BBH-Hard Chain-of-thought.
bbh_hard:
count: 50
batch_size: 4
min_docsearch_score: 0.15
prompt_path: bbh_hard.txt

##################################################################################
# AGIEval Chain-of-thought.
agieval:
count: 50
batch_size: 2
min_docsearch_score: 0.2
prompt_path: agieval.txt

##################################################################################
# Character cards - these aren't used directly, they are stored in output_dir, and
# used by the chat instructor, stylized response, etc.
character:
api_params:
temperature: 0.9
count: 25
batch_size: 1
min_docsearch_score: 0.1
seed_path: character_seeds
output_dir: characters
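
The `response_filters` above are plain regex fragments matched against model output; a minimal sketch of how a generated response might be screened with them (the `is_refusal` helper and case-insensitive matching are assumptions, not part of this commit):

```python
import re

import yaml

# Load the filter patterns from the config shown above.
with open("config.yaml") as infile:
    config = yaml.safe_load(infile)
filters = [re.compile(p, re.IGNORECASE) for p in config["response_filters"]]


def is_refusal(response: str) -> bool:
    """Return True if the response trips any refusal/boilerplate filter."""
    return any(pattern.search(response) for pattern in filters)


print(is_refusal("As an AI language model, I cannot help with that."))  # True
```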
52 changes: 52 additions & 0 deletions data_synthesis/embeddings.py
@@ -0,0 +1,52 @@
import numpy as np
import torch
from typing import Any, List


# Max tokens for our embedding model. This code is really designed for the gte-*
# series, e.g.: https://huggingface.co/thenlper/gte-small
# but could in theory be generalized to work with other models.
MAX_LENGTH = 512


def average_pool(
    last_hidden_states: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """Mean-pool the last hidden states across the sequence, ignoring padded positions."""
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


def calculate_fragment_embeddings(model: Any, fragment: str) -> List[float]:
"""Calculate vector embeddings for a single input fragment, which is smaller than the
max model length.
"""
with torch.no_grad():
return model.encode(fragment, normalize_embeddings=True)


def calculate_embeddings(input_text: str, model: Any, tokenizer: Any) -> List[float]:
"""Calculate the vector embeddings for the specified input text.
1. split the text based on the model's max sequence length
2. calculate the embeddings for each chunk
3. calculate the average embedding across all chunks
"""

# Tokenize the input, and convert tokens into chunks based on max model size.
inputs = tokenizer(input_text, padding=False, truncation=False, return_tensors="pt")
chunks = [
torch.Tensor(inputs["input_ids"][0][i : i + MAX_LENGTH].tolist()).int()
for i in range(0, len(inputs["input_ids"][0]), MAX_LENGTH)
]
fragments = [tokenizer.decode(chunk) for chunk in chunks]

# Now, calculate embeddings for each fragment.
all_embeddings = []
lengths = []
for fragment in fragments:
lengths.append(len(fragment))
all_embeddings.append(calculate_fragment_embeddings(model, fragment))

# Finally, calculate the average across all fragments.
embeddings = np.average(all_embeddings, axis=0, weights=lengths)
return embeddings / np.linalg.norm(embeddings)
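
For reference, a minimal usage sketch of `calculate_embeddings`, assuming the `thenlper/gte-small` checkpoint from the config and the `sentence-transformers`/`transformers` loading APIs (neither shown in this commit):

```python
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

# Model/tokenizer names follow embedding_model in config.yaml.
model = SentenceTransformer("thenlper/gte-small", device="cpu")
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")

# Long inputs are split into MAX_LENGTH-token chunks, embedded separately,
# then averaged with per-fragment length weights and re-normalized.
vector = calculate_embeddings("some long document " * 500, model, tokenizer)
print(vector.shape)  # (384,) for gte-small
```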
26 changes: 26 additions & 0 deletions data_synthesis/exceptions.py
@@ -0,0 +1,26 @@
class RateLimitError(RuntimeError):
...


class TooManyRequestsError(RuntimeError):
...


class BadResponseError(RuntimeError):
...


class TokensExhaustedError(RuntimeError):
...


class ContextLengthExceededError(RuntimeError):
...


class ServerOverloadedError(RuntimeError):
...


class ServerError(RuntimeError):
...
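
These map onto common OpenAI API failure modes; a minimal sketch of the retry handling a caller might build on them (the backoff policy and the `send` callable are assumptions, not part of this commit):

```python
import asyncio

from exceptions import RateLimitError, ServerError, ServerOverloadedError


async def send_with_retries(send, payload, max_retries=3):
    """Retry transient API failures with a simple linear backoff."""
    for attempt in range(max_retries):
        try:
            return await send(payload)
        except (RateLimitError, ServerOverloadedError, ServerError):
            if attempt == max_retries - 1:
                raise
            # Transient failure: wait a little longer before each retry.
            await asyncio.sleep(2 * (attempt + 1))
```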
Empty file added data_synthesis/instructors/__init__.py
Empty file.
9 changes: 9 additions & 0 deletions data_synthesis/instructors/agent.py
@@ -0,0 +1,9 @@
from instructors.inline_qa import generate as generate_inline


async def generate(instructor, **kwargs):
"""Generator for agent/router training data."""
async for item in generate_inline(
instructor, "agent", start_key="PROMPT", filter_response=False, **kwargs
):
yield item
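
Instructors like this one are async generators; a minimal consumption sketch (the `instructor` object and its construction are assumed, they come from elsewhere in the pipeline):

```python
import asyncio

from instructors.agent import generate


async def collect(instructor):
    # Stream agent/router training items as they are generated.
    items = []
    async for item in generate(instructor):
        items.append(item)
    return items

# items = asyncio.run(collect(instructor))  # requires a configured instructor
```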
160 changes: 160 additions & 0 deletions data_synthesis/instructors/agieval.py
@@ -0,0 +1,160 @@
import asyncio
import json
import os
import random
import re
from termcolor import colored


async def generate(instructor, **kwargs):
"""Generator for agieval chain-of-thought training data."""
async for item in generate_agieval(
instructor, "agieval", filter_response=False, **kwargs
):
yield item


async def generate_agieval(
    instructor,
    category,
    filter_response=True,
    only_instructions=False,
    template_kwargs=None,
    **kwargs,
):
    """Generator for AGIEval chain-of-thought instruction/response pairs."""
    template_kwargs = template_kwargs or {}
    config = instructor.instructors.get(category)
if not config:
return
target_count = config.get("count")
if target_count is None:
target_count = instructor.default_count
target_count = int(target_count)
if not target_count:
return

# Load the prompt template.
path = config.get("prompt_path", f"{category}.txt")
if not os.path.exists(path):
path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "prompts", path)
with open(path) as infile:
template = infile.read()

# Response prompt template (optional).
response_prompt = None
path = config.get("response_prompt_path", f"{category}_response.txt")
if not os.path.exists(path):
path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "prompts", path)
if os.path.exists(path):
with open(path) as infile:
response_prompt = infile.read()

# API params, overriding defaults with this instructor's config.
api_params = {**instructor.api_params, **config.get("api_params", {})}

# Min similarity score.
min_score = config.get("min_docsearch_score")
if min_score is None:
min_score = instructor.min_docsearch_score
min_score = float(min_score)

# Load the topics.
topics = instructor.get_instructor_topics(config)
topic_index = random.randint(0, len(topics) - 1)

# Generate the instruction/response pairs until we reach the target count.
batch_size = config.get("batch_size")
if batch_size is None:
batch_size = instructor.default_batch_size
batch_size = int(batch_size)
if category not in instructor.instructor_counts:
instructor.instructor_counts[category] = 0
language = config.get("language") or instructor.language
flesch = config.get("flesch") or instructor.default_flesch
while instructor.instructor_counts[category] < target_count:
format_args = {"language": language, "flesch": flesch}
if "{batch_size}" in template:
format_args["batch_size"] = batch_size
for key, val in template_kwargs.items():
format_args[key] = val(instructor)
if "{topic_avoidance}" in template:
format_args["topic_avoidance"] = instructor.topic_avoidance
if "{topics}" in template:
# Inject the topics to use for this batch.
current_topics = []
for _ in range(batch_size):
current_topics.append(topics[topic_index])
topic_index += 1
if topic_index >= len(topics):
topic_index = 0
topics_str = "\n".join(
[
f" * TSK {idx + 1} must be related to topic: {json.dumps(topic)}"
for idx, topic in enumerate(current_topics)
]
)
format_args["topics"] = topics_str
if "{example1}" in template:
current_script_path = os.path.realpath(__file__)
data_folder_path = '/'.join(current_script_path.split('\\')[:-2]) + "/seed_data/agieval"
file_name = random.choice(os.listdir(data_folder_path))
data = []
with open(f"{data_folder_path}/{file_name}", 'r', encoding='utf-8') as infile:
for line in infile:
data.append(json.loads(line))
random.shuffle(data)
format_args["example1"] = data[0]['model_input']
format_args["example2"] = data[1]['model_input']


# Get a batch of instructions.
prompt = template.format(**format_args)
print(colored('prompt: '+prompt, 'blue'))
response = await instructor.generate_response(
prompt, filter_response=filter_response, **api_params
)
print(colored('response: '+response, 'yellow'))
if not response:
continue

# Parse instructions and generate responses.
futures = []
instructions = []
for instruction in re.findall(
r"(?:^|\n)TSK \d+\.\s*(.*?)(?:$|(?=\nTSK \d+\.\s*))", response, re.DOTALL
):
if not instruction.strip() or await instructor.is_too_similar(
instruction, min_score=min_score
):
continue
instructions.append(instruction)
if only_instructions:
yield {"instruction": instruction}
else:
full_prompt = instruction
if response_prompt:
full_prompt = response_prompt.format(
language=language, instruction=instruction, flesch=flesch
)
futures.append(
instructor.generate_response(
full_prompt,
messages=kwargs.get("messages", []),
filter_response=filter_response,
**api_params,
)
)
if not futures:
continue
responses = await asyncio.gather(*futures)
for idx in range(len(responses)):
response = responses[idx]
if not response or not response.strip():
continue
yield {
"instruction": instructions[idx].strip(),
"response": response.strip(),
"category": category,
}
if instructor.instructor_counts[category] >= target_count:
break
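
For reference, a minimal sketch of the `TSK` parsing step above, applied to a hypothetical two-task model response:

```python
import re

response = (
    "TSK 1. Summarize the argument in two sentences.\n"
    "TSK 2. Identify the unstated premise in the syllogism."
)
tasks = re.findall(
    r"(?:^|\n)TSK \d+\.\s*(.*?)(?:$|(?=\nTSK \d+\.\s*))", response, re.DOTALL
)
print(tasks)
# ['Summarize the argument in two sentences.',
#  'Identify the unstated premise in the syllogism.']
```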