Commit

feat: add data synthesis pipe

harshraj172 committed Sep 13, 2023
0 parents commit fc67468

Showing 151 changed files with 22,557 additions and 0 deletions.
18 changes: 18 additions & 0 deletions data_synthesis/README.md
@@ -0,0 +1,18 @@
## Data Synthesis

### Setup
```
conda create -n datasyn python=3.10
conda activate datasyn
pip install -r requirements.txt
```

### Run

The configuration has been moved to `config.yaml`.

Once you have the desired configuration, run:
```
python run.py --config_path config.yaml
```

Empty file added data_synthesis/__init__.py
Empty file.
108 changes: 108 additions & 0 deletions data_synthesis/config.yaml
@@ -0,0 +1,108 @@
# The model to use in generation. Available models: https://platform.openai.com/docs/models/continuous-model-upgrades
model: "gpt-4"

# OpenAI API key (if null, uses environment variable OPENAI_API_KEY)
openai_api_key:

# Optionally configure the OpenAI organization ID.
organization_id:

# Combined output file path.
output_path: instructions.jsonl

# Path to the default topics file.
topics_path: topics.txt

# Overwrite the output file, use with care!
overwrite: false

# Append to the output file.
append: true

# Embedding model, for calculating similarity between documents; probably best left as-is since the code is fairly specific to this one.
embedding_model: thenlper/gte-small
embedding_device: cpu
# If you have a GPU, set this to "cuda", e.g.:
# embedding_device: cuda

# Topic avoidance prompt string.
topic_avoidance: Avoid any tasks that would be related to climate change, green tech, renewable energy, DEI (diversity, equity, inclusion), sex and/or gender, religion, politics, social issues, race, ethnicity, artificial intelligence, baking/cooking, urban development, or any topic that you would likely not respond to, or any task which a language model would not be able to respond to, e.g. tasks about emotions, feelings, physical senses, etc.

# Regexes used to filter responses, mostly common words and phrases used in refusals.
response_filters:
- "my programming"
- "openai"
- "language model"
- "large language"
- "as an? (ai|generative language|gpt|bot)"
- "illegal and dangerous"
- "i do(n't| not) (possess|have|exhibit) (personal|consciousness|subjective)"
- "personal (feelings|thoughts|emotions|desires|experiences|goals|objective|belief)"
- "(can('t| ?not)|w(on't|will not)|unable.?) (\\w+\\s)+(with (that|your)|your \\w+|provide)"
- "my limitations"
- "the limitations of my"
- "my abilities"
- "violates my"
- "i (can('t| ?not)|w(on't|will not)|am (not |un)able.?).{0,30}(you are|you're|your )"
- "please note that"
- "flesch"

# Optionally limit the maximum number of tokens to use when generating instructions.
max_tokens:

# Minimum similarity score when checking for duplicates.
min_docsearch_score: 0.07

# Default OpenAI API request parameters.
api_params:
temperature: 0.7
top_p: 0.5
frequency_penalty: 0.0
  presence_penalty: 2.0

# Topic generation prompt.
topic_prompt: Give me a numbered list of 20 completely random topics. {topic_avoidance}
topic_request_count: 20

# Default count per generator, if not specified.
default_count: 100

# Default batch size, if not specified.
default_batch_size: 10

# Default readability score hint: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
default_flesch: The output should be written in such a way as to have a Flesch-Kincaid readability score of 30 or lower - best understood by those with college education. The response must not contain any notes or information about Flesch-Kincaid scores.

# Language.
language: English

# Individual instructor configurations.
instructors:

##################################################################################
# BBH-Hard Chain-of-thought.
bbh_hard:
count: 50
batch_size: 4
min_docsearch_score: 0.15
prompt_path: bbh_hard.txt

##################################################################################
# AGIEval Chain-of-thought.
agieval:
count: 50
batch_size: 2
min_docsearch_score: 0.2
prompt_path: agieval.txt

##################################################################################
# Character cards - these aren't used directly, they are stored in output_dir, and
# used by the chat instructor, stylized response, etc.
character:
api_params:
temperature: 0.9
count: 25
batch_size: 1
min_docsearch_score: 0.1
seed_path: character_seeds
output_dir: characters
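
The `response_filters` above are plain regex fragments matched against model output; a minimal sketch of how a generated response might be screened with them (the `is_refusal` helper and case-insensitive matching are assumptions, not part of this commit):

```python
import re

import yaml

# Load the filter patterns from the config shown above.
with open("config.yaml") as infile:
    config = yaml.safe_load(infile)
filters = [re.compile(p, re.IGNORECASE) for p in config["response_filters"]]


def is_refusal(response: str) -> bool:
    """Return True if the response trips any refusal/boilerplate filter."""
    return any(pattern.search(response) for pattern in filters)


print(is_refusal("As an AI language model, I cannot help with that."))  # True
```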
52 changes: 52 additions & 0 deletions data_synthesis/embeddings.py
@@ -0,0 +1,52 @@
import numpy as np
import torch
from typing import Any, List


# Max tokens for our embedding model. This code is really designed for the gte-*
# series, e.g.: https://huggingface.co/thenlper/gte-small
# but could in theory be generalized to work with other models.
MAX_LENGTH = 512


def average_pool(
    last_hidden_states: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """Mean-pool the last hidden states across the sequence, ignoring padded positions."""
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


def calculate_fragment_embeddings(model: Any, fragment: str) -> List[float]:
"""Calculate vector embeddings for a single input fragment, which is smaller than the
max model length.
"""
with torch.no_grad():
return model.encode(fragment, normalize_embeddings=True)


def calculate_embeddings(input_text: str, model: Any, tokenizer: Any) -> List[float]:
"""Calculate the vector embeddings for the specified input text.
1. split the text based on the model's max sequence length
2. calculate the embeddings for each chunk
3. calculate the average embedding across all chunks
"""

# Tokenize the input, and convert tokens into chunks based on max model size.
inputs = tokenizer(input_text, padding=False, truncation=False, return_tensors="pt")
chunks = [
torch.Tensor(inputs["input_ids"][0][i : i + MAX_LENGTH].tolist()).int()
for i in range(0, len(inputs["input_ids"][0]), MAX_LENGTH)
]
fragments = [tokenizer.decode(chunk) for chunk in chunks]

# Now, calculate embeddings for each fragment.
all_embeddings = []
lengths = []
for fragment in fragments:
lengths.append(len(fragment))
all_embeddings.append(calculate_fragment_embeddings(model, fragment))

# Finally, calculate the average across all fragments.
embeddings = np.average(all_embeddings, axis=0, weights=lengths)
return embeddings / np.linalg.norm(embeddings)
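
For reference, a minimal usage sketch of `calculate_embeddings`, assuming the `thenlper/gte-small` checkpoint from the config and the `sentence-transformers`/`transformers` loading APIs (neither shown in this commit):

```python
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

# Model/tokenizer names follow embedding_model in config.yaml.
model = SentenceTransformer("thenlper/gte-small", device="cpu")
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")

# Long inputs are split into MAX_LENGTH-token chunks, embedded separately,
# then averaged with per-fragment length weights and re-normalized.
vector = calculate_embeddings("some long document " * 500, model, tokenizer)
print(vector.shape)  # (384,) for gte-small
```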
26 changes: 26 additions & 0 deletions data_synthesis/exceptions.py
@@ -0,0 +1,26 @@
class RateLimitError(RuntimeError):
...


class TooManyRequestsError(RuntimeError):
...


class BadResponseError(RuntimeError):
...


class TokensExhaustedError(RuntimeError):
...


class ContextLengthExceededError(RuntimeError):
...


class ServerOverloadedError(RuntimeError):
...


class ServerError(RuntimeError):
...
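
These map onto common OpenAI API failure modes; a minimal sketch of the retry handling a caller might build on them (the backoff policy and the `send` callable are assumptions, not part of this commit):

```python
import asyncio

from exceptions import RateLimitError, ServerError, ServerOverloadedError


async def send_with_retries(send, payload, max_retries=3):
    """Retry transient API failures with a simple linear backoff."""
    for attempt in range(max_retries):
        try:
            return await send(payload)
        except (RateLimitError, ServerOverloadedError, ServerError):
            if attempt == max_retries - 1:
                raise
            # Transient failure: wait a little longer before each retry.
            await asyncio.sleep(2 * (attempt + 1))
```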
Empty file added data_synthesis/instructors/__init__.py
Empty file.
9 changes: 9 additions & 0 deletions data_synthesis/instructors/agent.py
@@ -0,0 +1,9 @@
from instructors.inline_qa import generate as generate_inline


async def generate(instructor, **kwargs):
"""Generator for agent/router training data."""
async for item in generate_inline(
instructor, "agent", start_key="PROMPT", filter_response=False, **kwargs
):
yield item
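
Instructors like this one are async generators; a minimal consumption sketch (the `instructor` object and its construction are assumed, they come from elsewhere in the pipeline):

```python
import asyncio

from instructors.agent import generate


async def collect(instructor):
    # Stream agent/router training items as they are generated.
    items = []
    async for item in generate(instructor):
        items.append(item)
    return items

# items = asyncio.run(collect(instructor))  # requires a configured instructor
```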
160 changes: 160 additions & 0 deletions data_synthesis/instructors/agieval.py
@@ -0,0 +1,160 @@
import asyncio
import json
import os
import random
import re
from termcolor import colored


async def generate(instructor, **kwargs):
"""Generator for agieval chain-of-thought training data."""
async for item in generate_agieval(
instructor, "agieval", filter_response=False, **kwargs
):
yield item


async def generate_agieval(
    instructor,
    category,
    filter_response=True,
    only_instructions=False,
    template_kwargs=None,
    **kwargs,
):
    """Generator for AGIEval chain-of-thought instruction/response pairs."""
    template_kwargs = template_kwargs or {}
    config = instructor.instructors.get(category)
if not config:
return
target_count = config.get("count")
if target_count is None:
target_count = instructor.default_count
target_count = int(target_count)
if not target_count:
return

# Load the prompt template.
path = config.get("prompt_path", f"{category}.txt")
if not os.path.exists(path):
path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "prompts", path)
with open(path) as infile:
template = infile.read()

# Response prompt template (optional).
response_prompt = None
path = config.get("response_prompt_path", f"{category}_response.txt")
if not os.path.exists(path):
path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "prompts", path)
if os.path.exists(path):
with open(path) as infile:
response_prompt = infile.read()

# API params, overriding defaults with this instructor's config.
api_params = {**instructor.api_params, **config.get("api_params", {})}

# Min similarity score.
min_score = config.get("min_docsearch_score")
if min_score is None:
min_score = instructor.min_docsearch_score
min_score = float(min_score)

# Load the topics.
topics = instructor.get_instructor_topics(config)
topic_index = random.randint(0, len(topics) - 1)

# Generate the instruction/response pairs until we reach the target count.
batch_size = config.get("batch_size")
if batch_size is None:
batch_size = instructor.default_batch_size
batch_size = int(batch_size)
if category not in instructor.instructor_counts:
instructor.instructor_counts[category] = 0
language = config.get("language") or instructor.language
flesch = config.get("flesch") or instructor.default_flesch
while instructor.instructor_counts[category] < target_count:
format_args = {"language": language, "flesch": flesch}
if "{batch_size}" in template:
format_args["batch_size"] = batch_size
for key, val in template_kwargs.items():
format_args[key] = val(instructor)
if "{topic_avoidance}" in template:
format_args["topic_avoidance"] = instructor.topic_avoidance
if "{topics}" in template:
# Inject the topics to use for this batch.
current_topics = []
for _ in range(batch_size):
current_topics.append(topics[topic_index])
topic_index += 1
if topic_index >= len(topics):
topic_index = 0
topics_str = "\n".join(
[
f" * TSK {idx + 1} must be related to topic: {json.dumps(topic)}"
for idx, topic in enumerate(current_topics)
]
)
format_args["topics"] = topics_str
if "{example1}" in template:
current_script_path = os.path.realpath(__file__)
data_folder_path = '/'.join(current_script_path.split('\\')[:-2]) + "/seed_data/agieval"
file_name = random.choice(os.listdir(data_folder_path))
data = []
with open(f"{data_folder_path}/{file_name}", 'r', encoding='utf-8') as infile:
for line in infile:
data.append(json.loads(line))
random.shuffle(data)
format_args["example1"] = data[0]['model_input']
format_args["example2"] = data[1]['model_input']


# Get a batch of instructions.
prompt = template.format(**format_args)
print(colored('prompt: '+prompt, 'blue'))
response = await instructor.generate_response(
prompt, filter_response=filter_response, **api_params
)
print(colored('response: '+response, 'yellow'))
if not response:
continue

# Parse instructions and generate responses.
futures = []
instructions = []
for instruction in re.findall(
r"(?:^|\n)TSK \d+\.\s*(.*?)(?:$|(?=\nTSK \d+\.\s*))", response, re.DOTALL
):
if not instruction.strip() or await instructor.is_too_similar(
instruction, min_score=min_score
):
continue
instructions.append(instruction)
if only_instructions:
yield {"instruction": instruction}
else:
full_prompt = instruction
if response_prompt:
full_prompt = response_prompt.format(
language=language, instruction=instruction, flesch=flesch
)
futures.append(
instructor.generate_response(
full_prompt,
messages=kwargs.get("messages", []),
filter_response=filter_response,
**api_params,
)
)
if not futures:
continue
responses = await asyncio.gather(*futures)
for idx in range(len(responses)):
response = responses[idx]
if not response or not response.strip():
continue
yield {
"instruction": instructions[idx].strip(),
"response": response.strip(),
"category": category,
}
if instructor.instructor_counts[category] >= target_count:
break
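
For reference, a minimal sketch of the `TSK` parsing step above, applied to a hypothetical two-task model response:

```python
import re

response = (
    "TSK 1. Summarize the argument in two sentences.\n"
    "TSK 2. Identify the unstated premise in the syllogism."
)
tasks = re.findall(
    r"(?:^|\n)TSK \d+\.\s*(.*?)(?:$|(?=\nTSK \d+\.\s*))", response, re.DOTALL
)
print(tasks)
# ['Summarize the argument in two sentences.',
#  'Identify the unstated premise in the syllogism.']
```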