Commit fc67468 (initial commit, 0 parents): 151 changed files with 22,557 additions and 0 deletions.
@@ -0,0 +1,18 @@
## Data Synthesis

### Setup

```
conda create -n datasyn python=3.10
conda activate datasyn
pip install -r requirements.txt
```

### Run

The configuration has been moved to `config.yaml`.

Once you have the desired configuration, run:

```
python run.py --config_path config.yaml
```
Empty file.
@@ -0,0 +1,108 @@
# The model to use in generation. Available models: https://platform.openai.com/docs/models/continuous-model-upgrades
model: "gpt-4"

# OpenAI API key (if null, uses environment variable OPENAI_API_KEY).
# Never commit a real key; the value is left null here for that reason.
openai_api_key: null

# Optionally configure the OpenAI organization ID.
organization_id:

# Combined output file path.
output_path: instructions.jsonl

# Path to the default topics file.
topics_path: topics.txt

# Overwrite the output file, use with care!
overwrite: false

# Append to the output file.
append: true

# Embedding model, for calculating similarity between documents; probably best
# left as-is, since the code is fairly specific to this one.
embedding_model: thenlper/gte-small
embedding_device: cpu
# If you have a GPU, set this to "cuda", e.g.:
# embedding_device: cuda

# Topic avoidance prompt string.
topic_avoidance: Avoid any tasks that would be related to climate change, green tech, renewable energy, DEI (diversity, equity, inclusion), sex and/or gender, religion, politics, social issues, race, ethnicity, artificial intelligence, baking/cooking, urban development, or any topic that you would likely not respond to, or any task which a language model would not be able to respond to, e.g. tasks about emotions, feelings, physical senses, etc.

# Regexes used to filter responses, mostly common words and phrases used in refusals.
response_filters:
  - "my programming"
  - "openai"
  - "language model"
  - "large language"
  - "as an? (ai|generative language|gpt|bot)"
  - "illegal and dangerous"
  - "i do(n't| not) (possess|have|exhibit) (personal|consciousness|subjective)"
  - "personal (feelings|thoughts|emotions|desires|experiences|goals|objective|belief)"
  - "(can('t| ?not)|w(on't|ill not)|unable.?) (\\w+\\s)+(with (that|your)|your \\w+|provide)"
  - "my limitations"
  - "the limitations of my"
  - "my abilities"
  - "violates my"
  - "i (can('t| ?not)|w(on't|ill not)|am (not |un)able.?).{0,30}(you are|you're|your )"
  - "please note that"
  - "flesch"

# Optionally limit the maximum number of tokens to use when generating instructions.
max_tokens:

# Minimum similarity score when checking for duplicates.
min_docsearch_score: 0.07

# Default OpenAI API request parameters.
api_params:
  temperature: 0.7
  top_p: 0.5
  frequency_penalty: 0.0
  presence_penalty: 2

# Topic generation prompt.
topic_prompt: Give me a numbered list of 20 completely random topics. {topic_avoidance}
topic_request_count: 20

# Default count per generator, if not specified.
default_count: 100

# Default batch size, if not specified.
default_batch_size: 10

# Default readability score hint: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
default_flesch: The output should be written in such a way as to have a Flesch-Kincaid readability score of 30 or lower - best understood by those with college education. The response must not contain any notes or information about Flesch-Kincaid scores.

# Language.
language: English

# Individual instructor configurations.
instructors:

  ##################################################################################
  # BBH-Hard chain-of-thought.
  bbh_hard:
    count: 50
    batch_size: 4
    min_docsearch_score: 0.15
    prompt_path: bbh_hard.txt

  ##################################################################################
  # AGIEval chain-of-thought.
  agieval:
    count: 50
    batch_size: 2
    min_docsearch_score: 0.2
    prompt_path: agieval.txt

  ##################################################################################
  # Character cards - these aren't used directly; they are stored in output_dir and
  # used by the chat instructor, stylized response, etc.
  character:
    api_params:
      temperature: 0.9
    count: 25
    batch_size: 1
    min_docsearch_score: 0.1
    seed_path: character_seeds
    output_dir: characters
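
For orientation, `api_params` maps one-to-one onto OpenAI sampling parameters, and `topic_prompt` is an ordinary Python format string. A minimal sketch of wiring the parsed config into a single request, assuming the legacy (pre-1.0) `openai` client and PyYAML; none of this wiring appears in the commit itself:

```python
# Sketch only: assumes the legacy (pre-1.0) openai client and PyYAML;
# this wiring is illustrative, not code from this commit.
import os

import openai
import yaml

with open("config.yaml") as infile:
    config = yaml.safe_load(infile)

openai.api_key = config.get("openai_api_key") or os.environ["OPENAI_API_KEY"]

prompt = config["topic_prompt"].format(topic_avoidance=config["topic_avoidance"])
response = openai.ChatCompletion.create(
    model=config["model"],
    messages=[{"role": "user", "content": prompt}],
    # temperature, top_p, frequency_penalty, presence_penalty from api_params.
    **config["api_params"],
)
print(response["choices"][0]["message"]["content"])
```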
@@ -0,0 +1,52 @@
import numpy as np
import torch
from typing import Any, List


# Max tokens for our embedding model. This code is really designed for the gte-*
# series, e.g.: https://huggingface.co/thenlper/gte-small
# but could in theory be generalized to work with other models.
MAX_LENGTH = 512


def average_pool(
    last_hidden_states: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """Average the last hidden states over non-padding tokens."""
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


def calculate_fragment_embeddings(model: Any, fragment: str) -> List[float]:
    """Calculate vector embeddings for a single input fragment, which is smaller
    than the max model length.
    """
    with torch.no_grad():
        return model.encode(fragment, normalize_embeddings=True)


def calculate_embeddings(input_text: str, model: Any, tokenizer: Any) -> List[float]:
    """Calculate the vector embeddings for the specified input text.

    1. Split the text into chunks based on the model's max sequence length.
    2. Calculate the embeddings for each chunk.
    3. Calculate the length-weighted average embedding across all chunks.
    """
    # Tokenize the input, and convert tokens into chunks based on max model size.
    inputs = tokenizer(input_text, padding=False, truncation=False, return_tensors="pt")
    chunks = [
        torch.Tensor(inputs["input_ids"][0][i : i + MAX_LENGTH].tolist()).int()
        for i in range(0, len(inputs["input_ids"][0]), MAX_LENGTH)
    ]
    fragments = [tokenizer.decode(chunk) for chunk in chunks]

    # Now, calculate embeddings for each fragment.
    all_embeddings = []
    lengths = []
    for fragment in fragments:
        lengths.append(len(fragment))
        all_embeddings.append(calculate_fragment_embeddings(model, fragment))

    # Finally, calculate the length-weighted average across all fragments,
    # re-normalized to unit length.
    embeddings = np.average(all_embeddings, axis=0, weights=lengths)
    return embeddings / np.linalg.norm(embeddings)
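
A usage sketch tying this back to the `embedding_model` in config.yaml: gte-small is a sentence-transformers checkpoint, so loading it as below should work, though the loading lines are an assumption rather than code from this commit. Because the result is unit-normalized, a plain dot product yields the cosine similarity that `min_docsearch_score` presumably thresholds.

```python
# Sketch only: model/tokenizer loading is assumed, not part of this commit.
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

model = SentenceTransformer("thenlper/gte-small")
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")

a = calculate_embeddings("A long document about topic A. " * 100, model, tokenizer)
b = calculate_embeddings("A long document about topic B. " * 100, model, tokenizer)
print(a.shape)              # (384,) for gte-small
print(float(np.dot(a, b)))  # cosine similarity, since both are unit-normalized
```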
@@ -0,0 +1,26 @@
class RateLimitError(RuntimeError):
    ...


class TooManyRequestsError(RuntimeError):
    ...


class BadResponseError(RuntimeError):
    ...


class TokensExhaustedError(RuntimeError):
    ...


class ContextLengthExceededError(RuntimeError):
    ...


class ServerOverloadedError(RuntimeError):
    ...


class ServerError(RuntimeError):
    ...
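
These classes carry no behavior of their own; they exist so callers can branch on API failure modes. A minimal sketch of how a caller might use them for retry-with-backoff; `call_api` and the backoff policy are hypothetical, not part of this commit:

```python
# Sketch only: `call_api` is a hypothetical coroutine standing in for whatever
# actually performs the OpenAI request; the backoff policy is an assumption.
import asyncio


async def generate_with_retries(call_api, prompt, max_attempts=3):
    delay = 1.0
    for attempt in range(max_attempts):
        try:
            return await call_api(prompt)
        except (RateLimitError, TooManyRequestsError, ServerOverloadedError):
            # Transient failures: wait, then retry with exponential backoff.
            await asyncio.sleep(delay)
            delay *= 2
        except (TokensExhaustedError, ContextLengthExceededError):
            # Not retryable: the request itself must change.
            raise
    raise ServerError(f"Giving up after {max_attempts} attempts")
```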
Empty file.
@@ -0,0 +1,9 @@
from instructors.inline_qa import generate as generate_inline


async def generate(instructor, **kwargs):
    """Generator for agent/router training data."""
    async for item in generate_inline(
        instructor, "agent", start_key="PROMPT", filter_response=False, **kwargs
    ):
        yield item
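
Each instructor module follows this pattern: a thin async generator that delegates to a shared generator with a category name and parsing options. A hedged sketch of how a runner might consume it; the `instructor` object and output path are assumptions about the surrounding code:

```python
# Sketch only: `instructor` stands in for the runner's real instructor object;
# the output path is illustrative.
import asyncio
import json


async def collect(instructor):
    with open("agent.jsonl", "a") as outfile:
        async for item in generate(instructor):
            outfile.write(json.dumps(item) + "\n")


# asyncio.run(collect(instructor))
```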
@@ -0,0 +1,160 @@
import asyncio
import json
import os
import random
import re
from termcolor import colored


async def generate(instructor, **kwargs):
    """Generator for agieval chain-of-thought training data."""
    async for item in generate_agieval(
        instructor, "agieval", filter_response=False, **kwargs
    ):
        yield item


async def generate_agieval(
    instructor,
    category,
    filter_response=True,
    only_instructions=False,
    template_kwargs=None,
    **kwargs,
):
    """Generator for AGIEval-style chain-of-thought instruction/response tasks."""
    # Avoid a mutable default argument.
    template_kwargs = template_kwargs or {}
    config = instructor.instructors.get(category)
    if not config:
        return
    target_count = config.get("count")
    if target_count is None:
        target_count = instructor.default_count
    target_count = int(target_count)
    if not target_count:
        return

    # Load the prompt template.
    path = config.get("prompt_path", f"{category}.txt")
    if not os.path.exists(path):
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "prompts", path)
    with open(path) as infile:
        template = infile.read()

    # Response prompt template (optional).
    response_prompt = None
    path = config.get("response_prompt_path", f"{category}_response.txt")
    if not os.path.exists(path):
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "prompts", path)
    if os.path.exists(path):
        with open(path) as infile:
            response_prompt = infile.read()

    # API params, overriding defaults with this instructor's config.
    api_params = {**instructor.api_params, **config.get("api_params", {})}

    # Min similarity score.
    min_score = config.get("min_docsearch_score")
    if min_score is None:
        min_score = instructor.min_docsearch_score
    min_score = float(min_score)

    # Load the topics.
    topics = instructor.get_instructor_topics(config)
    topic_index = random.randint(0, len(topics) - 1)

    # Generate the instruction/response pairs until we reach the target count.
    batch_size = config.get("batch_size")
    if batch_size is None:
        batch_size = instructor.default_batch_size
    batch_size = int(batch_size)
    if category not in instructor.instructor_counts:
        instructor.instructor_counts[category] = 0
    language = config.get("language") or instructor.language
    flesch = config.get("flesch") or instructor.default_flesch
    while instructor.instructor_counts[category] < target_count:
        format_args = {"language": language, "flesch": flesch}
        if "{batch_size}" in template:
            format_args["batch_size"] = batch_size
        for key, val in template_kwargs.items():
            format_args[key] = val(instructor)
        if "{topic_avoidance}" in template:
            format_args["topic_avoidance"] = instructor.topic_avoidance
        if "{topics}" in template:
            # Inject the topics to use for this batch.
            current_topics = []
            for _ in range(batch_size):
                current_topics.append(topics[topic_index])
                topic_index += 1
                if topic_index >= len(topics):
                    topic_index = 0
            topics_str = "\n".join(
                [
                    f" * TSK {idx + 1} must be related to topic: {json.dumps(topic)}"
                    for idx, topic in enumerate(current_topics)
                ]
            )
            format_args["topics"] = topics_str
        if "{example1}" in template:
            # Sample two few-shot examples from the AGIEval seed data. Build the
            # path with os.path so it also resolves on POSIX systems (the
            # original split on backslashes, which only works on Windows).
            current_script_path = os.path.realpath(__file__)
            data_folder_path = os.path.join(
                os.path.dirname(os.path.dirname(current_script_path)),
                "seed_data",
                "agieval",
            )
            file_name = random.choice(os.listdir(data_folder_path))
            data = []
            with open(
                os.path.join(data_folder_path, file_name), "r", encoding="utf-8"
            ) as infile:
                for line in infile:
                    data.append(json.loads(line))
            random.shuffle(data)
            format_args["example1"] = data[0]["model_input"]
            format_args["example2"] = data[1]["model_input"]

        # Get a batch of instructions.
        prompt = template.format(**format_args)
        print(colored(f"prompt: {prompt}", "blue"))
        response = await instructor.generate_response(
            prompt, filter_response=filter_response, **api_params
        )
        print(colored(f"response: {response}", "yellow"))
        if not response:
            continue

        # Parse instructions and generate responses.
        futures = []
        instructions = []
        for instruction in re.findall(
            r"(?:^|\n)TSK \d+\.\s*(.*?)(?:$|(?=\nTSK \d+\.\s*))", response, re.DOTALL
        ):
            if not instruction.strip() or await instructor.is_too_similar(
                instruction, min_score=min_score
            ):
                continue
            instructions.append(instruction)
            if only_instructions:
                yield {"instruction": instruction}
            else:
                full_prompt = instruction
                if response_prompt:
                    full_prompt = response_prompt.format(
                        language=language, instruction=instruction, flesch=flesch
                    )
                futures.append(
                    instructor.generate_response(
                        full_prompt,
                        messages=kwargs.get("messages", []),
                        filter_response=filter_response,
                        **api_params,
                    )
                )
        if not futures:
            continue
        responses = await asyncio.gather(*futures)
        for idx in range(len(responses)):
            response = responses[idx]
            if not response or not response.strip():
                continue
            yield {
                "instruction": instructions[idx].strip(),
                "response": response.strip(),
                "category": category,
            }
            if instructor.instructor_counts[category] >= target_count:
                break
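
The `TSK \d+.` regex above defines the contract between the batch prompt and the parser: each task in the model's reply must start a line with `TSK <n>.`. A quick check of that regex against a fabricated reply (the sample text is illustrative only):

```python
# Sketch only: the sample response text is fabricated to illustrate the regex.
import re

sample = (
    "TSK 1. Solve for x: 2x + 3 = 11.\nShow your work.\n"
    "TSK 2. Translate 'hello' to French."
)
tasks = re.findall(
    r"(?:^|\n)TSK \d+\.\s*(.*?)(?:$|(?=\nTSK \d+\.\s*))", sample, re.DOTALL
)
print(tasks)
# ['Solve for x: 2x + 3 = 11.\nShow your work.', "Translate 'hello' to French."]
```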