-
Notifications
You must be signed in to change notification settings - Fork 22
/
1-transform.py
42 lines (35 loc) · 1.4 KB
/
1-transform.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# PART 1 – Finetuning Llama-3 on your own data (transforming data)
# Make sure to run `pip install together`
import json
from together.utils import check_file
# Base name of the dataset; the input and output file paths are derived from it.
datasetName = "TrainMathInstruct-500"
old_file_path = f"{datasetName}.json"
new_file_path = f"Formatted{datasetName}.jsonl"

# Load old format JSON data. The loop further down iterates over the loaded
# value and reads the "instruction" and "output" keys of each item.
with open(old_file_path, "r", encoding="utf-8") as old_file:
    old_data = json.load(old_file)

# Define Llama-3 prompt and system prompt.
# NOTE(review): the template begins and ends with a newline and puts a single
# newline after each header; confirm this matches the chat template expected
# by the model being finetuned before changing it.
llama_format = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>
{user_question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{model_answer}<|eot_id|>
"""
system_prompt = "You're a helpful assistant that answers math problems."

# Transform the data into the right format and write it to a JSONL file:
# one JSON object per line, each with a single "text" field holding the
# fully rendered prompt.
with open(new_file_path, "w", encoding="utf-8") as new_file:
    for piece in old_data:
        temp_data = {
            "text": llama_format.format(
                system_prompt=system_prompt,
                user_question=piece["instruction"],
                model_answer=piece["output"],
            )
        }
        new_file.write(json.dumps(temp_data) + "\n")

# We're going to check to see that the file is in the right format before we finetune
report = check_file(new_file_path)
print(report)

# A bare `assert` is stripped when Python runs with -O, which would silently
# skip this validation. Raise explicitly instead, keeping AssertionError as
# the failure type so the script fails the same way as before.
if not report["is_check_passed"]:
    raise AssertionError(
        f"check_file rejected {new_file_path}; see the report printed above"
    )