-
Notifications
You must be signed in to change notification settings - Fork 159
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #49 from e-p-armstrong/usability-overhaul
Augmentoolkit 3.0
- Loading branch information
Showing
195 changed files
with
12,504 additions
and
170,144 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# This is an empty test pipeline that exists to give you a starting point, with Augmentoolkit's conventions and abstractions already in place, to start building out your own pipelines for your own use cases. | |
|
||
Please consider opening a PR and contributing it if you make something cool! Or just use it yourself; that is OK too. | |
|
||
# If you run into problems while making a pipeline, consider creating an issue! |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
API: | ||
API_KEY_A: key | ||
API_KEY_B: key | |
BASE_URL_A: https://api.together.xyz | ||
BASE_URL_B: https://api.fireworks.ai/inference/v1 | ||
LOGICAL_MODEL_A: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo | ||
LOGICAL_MODEL_B: accounts/fireworks/models/llama-v3p1-8b-instruct | ||
MODE_A: api | ||
MODE_B: api | ||
PATH: | ||
DEFAULT_PROMPTS: ./prompts | ||
INPUT: ./raw_txt_input | ||
OUTPUT: ./output | ||
PROMPTS: ./prompts | ||
PHASES: | ||
PHASE_INDEX: 2 | ||
WORK_IN_PHASES: True | ||
SYSTEM: | ||
COMPLETION_MODE: False | ||
CONCURRENCY_LIMIT: 3 | ||
STOP: True | ||
SUBSET_SIZE: 3 | ||
USE_MIN_P: False | ||
USE_SUBSET: True # you will probably want to have use_subset on during testing and development to save money. | ||
CHUNK_SIZE: 2000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
import random | ||
import traceback | ||
from augmentoolkit.generation_functions.engine_wrapper_class import EngineWrapper | ||
from augmentoolkit.utils.write_output_to_file import write_output_to_file | ||
from BOILERPLATE_TO_MAKE_YOUR_OWN_PIPELINE.steps import API_KEY_A, API_KEY_B, BASE_URL_A, BASE_URL_B, CONCURRENCY_LIMIT, LOGICAL_MODEL_A, LOGICAL_MODEL_B, MODE_A, MODE_B, add_key, chunking_algorithm, count_tokens, make_id | ||
|
||
|
||
import nltk | ||
from tqdm import asyncio as tqdmasyncio | ||
|
||
|
||
import asyncio | ||
import glob | ||
import logging | ||
import os | ||
import sys | ||
import time | ||
import yaml | ||
|
||
# Load the pipeline configuration. The path to the YAML config file is taken
# from the CONFIG_PATH environment variable; a KeyError is raised if it is
# not set.
config_path = os.environ["CONFIG_PATH"]
with open(config_path, "r") as file:
    config = yaml.safe_load(file)

# PHASES section: phased-execution settings.
_phases_cfg = config["PHASES"]
WORK_IN_PHASES = bool(_phases_cfg["WORK_IN_PHASES"])
PHASE_INDEX = int(_phases_cfg["PHASE_INDEX"])

# SYSTEM section: subset and chunking settings.
_system_cfg = config["SYSTEM"]
USE_SUBSET = bool(_system_cfg["USE_SUBSET"])
SUBSET_SIZE = int(_system_cfg["SUBSET_SIZE"])
CHUNK_SIZE = int(_system_cfg["CHUNK_SIZE"])

# PATH section: folder that is searched for input documents.
INPUT = config["PATH"]["INPUT"]
|
||
|
||
async def main():
    """Run the boilerplate pipeline end to end.

    Steps:
      1. Recursively collect every ``.txt`` and ``.md`` file under ``INPUT``.
      2. Build the two ``EngineWrapper`` clients (A and B) from the config.
      3. Split each source file into chunks with ``chunking_algorithm``,
         using ``CHUNK_SIZE`` as the maximum token length.
      4. If ``USE_SUBSET`` is enabled, keep only the first ``SUBSET_SIZE``
         chunks so test runs stay cheap.
      5. Run ``add_key`` on every chunk, with at most ``CONCURRENCY_LIMIT``
         calls in flight at once.
      6. Print the elapsed time and, if anything was produced, one result.

    ``add_key`` receives ``output_list`` and is presumably what fills it
    (defined in the pipeline's ``steps`` module -- confirm there).
    """
    # NOTE Load the source texts
    print("Welcome to your test pipeline!")
    print(f"Input folder: {INPUT}")
    start_time = time.time()
    print("Begun")

    # Set up rate-limit-conscious functions
    semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)

    async def run_task_with_limit(task):
        """Await ``task`` while holding a slot of the shared semaphore."""
        async with semaphore:
            return await task

    extensions = [".txt", ".md"]

    source_texts = []
    for extension in extensions:
        path = f"{INPUT}/**/*" + extension
        source_texts = source_texts + glob.glob(path, recursive=True)

    if source_texts:
        print(source_texts)
    else:
        print(f"No source texts found in: {INPUT}")

    # NOTE Initialize the Engine (or API client)
    engine_wrapper = EngineWrapper(
        model=LOGICAL_MODEL_A,
        api_key=API_KEY_A,
        base_url=BASE_URL_A,
        mode=MODE_A,
    )

    engine_wrapper_large = EngineWrapper(
        model=LOGICAL_MODEL_B,
        api_key=API_KEY_B,
        base_url=BASE_URL_B,
        mode=MODE_B,
    )

    sentence_chunks = []
    for source_text in source_texts:
        sentence_chunks += chunking_algorithm(source_text, max_token_length=CHUNK_SIZE)

    # Honor the USE_SUBSET / SUBSET_SIZE config options: without this the
    # settings were read but ignored, and every chunk was sent to the API.
    if USE_SUBSET:
        sentence_chunks = sentence_chunks[:SUBSET_SIZE]

    # NOTE Generate the data
    output_list = []
    data_generations_tasks = [
        add_key(
            input_data=chunk,
            engine_wrapper=engine_wrapper_large,
            idx=idx,
            output_list=output_list,
        )
        for idx, chunk in enumerate(sentence_chunks)
    ]
    coroutines = [run_task_with_limit(task) for task in data_generations_tasks]
    for future in tqdmasyncio.tqdm.as_completed(coroutines):
        await future

    print(f"Time taken: {time.time() - start_time}")
    # Guard against an empty result list (no input files, or no generation
    # produced output) instead of crashing with IndexError on output_list[0].
    if output_list:
        print("You generated some data! Check the output folder for the results.")
        print("here's one of the results: ")
        print(output_list[0])
    else:
        print(
            "No results were generated. Check that the input folder contains "
            ".txt or .md files and that your API settings are correct."
        )
|
||
# Script entry point: run the main() coroutine to completion on a fresh event
# loop. This statement is at module top level with no __main__ guard, so the
# pipeline starts as soon as this file is executed (or imported).
asyncio.run(main())
44 changes: 44 additions & 0 deletions
44
BOILERPLATE_TO_MAKE_YOUR_OWN_PIPELINE/prompts/test_prompt.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
- role: system | ||
content: | | ||
You are an expert poet. Write a poem inspired by what you are shown. | ||
- role: user | ||
content: | | ||
Input: | ||
When life gives you lemons, don't make lemonade. Make life TAKE THE LEMONS BACK. GET MAD! | ||
- role: assistant | ||
content: | # You should always have few-shot examples | ||
Defiance's Zest | ||
In life's orchard, sour fruits abound, | ||
Yellow globes fall, scatter the ground. | ||
Convention whispers, "Sweeten the tart," | ||
But rebellion stirs within my heart. | ||
No sugared drink shall I concoct, | ||
No lemonade stand will be stocked. | ||
These citrus spheres, I'll not accept, | ||
A different path, I have now stepped. | ||
With fire in eyes and strength in hand, | ||
I gather lemons from this land. | ||
Back to life, I'll hurl each one, | ||
A citrus revolt has now begun. | ||
"Take them back!" I shout with might, | ||
"Your sour gifts, I'll no more sight!" | ||
For I am not a passive receiver, | ||
But a bold and brazen achiever. | ||
Let life know, I write my tale, | ||
No bitter fruit shall make me quail. | ||
In face of sourness, I'll stand tall, | ||
And rewrite the rules of nature's call. | ||
So when life's lemons come your way, | ||
Remember you have the power to say: | ||
"Not today, life, not this time," | ||
And make defiance your paradigm. | ||
- role: user | ||
content: | # Note how interpolation is done here. The template names line up with the keys of the input object. | ||
Passage: | ||
{chunk} |
Oops, something went wrong.