update assets
yan91083 committed Nov 28, 2023
1 parent c68811a commit aee55ac
Showing 5 changed files with 131 additions and 43 deletions.
2 changes: 2 additions & 0 deletions python/tabby-eval/.gitignore
@@ -0,0 +1,2 @@
+*tmp
+tabby_data_pipeline.egg-info
4 changes: 2 additions & 2 deletions python/tabby-eval/tabby_data_pipeline/__init__.py
@@ -4,9 +4,9 @@
 
 from dagster import AssetIn, Field, Int, asset, file_relative_path
 
-from . import assets
+from . import assets, create_csv
 
-all_assets = load_assets_from_modules([assets])
+all_assets = load_assets_from_modules([assets, create_csv])
 
 defs = Definitions(
     assets=all_assets,
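
Note: a quick way to confirm the create_csv assets are picked up — a sketch, not part of this commit, assuming only that load_assets_from_modules returns a list of single-key AssetsDefinition objects:

# Sanity-check sketch (not in this commit): list the asset keys Dagster
# collects from both modules; create_csv, tabby_dataset, and tabby_jupyter
# should now appear alongside the assets.py definitions.
from dagster import load_assets_from_modules

from tabby_data_pipeline import assets, create_csv

for assets_def in load_assets_from_modules([assets, create_csv]):
    print(assets_def.key)  # each definition here carries a single asset key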
15 changes: 10 additions & 5 deletions python/tabby-eval/tabby_data_pipeline/analyze.py
@@ -1,8 +1,14 @@
-import pandas as pd
 import json
 import sys
-#from eval_utils import postprocess_code_lines, remove_comments
-#from tree_sitter import Language, Parser
+import pandas as pd
+
+from dagster import (
+    AssetExecutionContext,
+    MetadataValue,
+    asset,
+    StaticPartitionsDefinition,
+    MultiPartitionsDefinition,
+)
 
 def get_bracket_lang_statement(completion):
     end_idx = None
@@ -22,6 +28,7 @@ def postprocess_code_lines(prompt, target, language):
     except Exception as e:
         return target
 
+
 def analyze(model, language, file):
 
     line_match = 0
@@ -78,5 +85,3 @@ def analyze(model, language, file):
         fout.write("\n")
 
 
-
-
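Note: this module is driven by the matching_* assets in assets.py; a minimal usage sketch follows (invocation shape taken from those assets, the model and language values are illustrative):

# Sketch (not part of the commit): how assets.py calls into this module.
from tabby_data_pipeline import analyze

model_id = "TabbyML/StarCoder-1B"    # one "model_id" partition key
model = model_id.split("/")[-1]      # "StarCoder-1B", as in the matching_* assets
analyze.analyze(model, "python", "line_completion.jsonl")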
130 changes: 95 additions & 35 deletions python/tabby-eval/tabby_data_pipeline/assets.py
@@ -1,32 +1,75 @@
-import modal
 import json
 import os, subprocess
+import modal
 
 import requests
 import pandas as pd
 
 import base64
 from io import BytesIO
 
 import matplotlib.pyplot as plt
 
 from typing import Dict, List
 
 from dagster import (
     AssetExecutionContext,
     MetadataValue,
     asset,
     get_dagster_logger,
     op,
     StaticPartitionsDefinition,
     MultiPartitionsDefinition,
     AssetIn,
     Field,
     Int,
     file_relative_path
 )
-from . import analyze, create_csv
 from dagstermill import define_dagstermill_asset
+from . import analyze
 
 
+@asset
+def baseline() -> str:
+    return "line_completion.jsonl"
+
+@asset
+def bm25() -> str:
+    return "line_completion_rg1_bm25.jsonl"
+
+@asset
+def oracle() -> str:
+    return "line_completion_oracle_bm25.jsonl"
+
+@asset(
+    partitions_def=MultiPartitionsDefinition(
+        {
+            "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']),
+            "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]),
+        }
+    ))
+def predict_baseline(context: AssetExecutionContext, baseline: str) -> None:
+    model_id = context.partition_key.keys_by_dimension["model_id"]
+    language = context.partition_key.keys_by_dimension["language"]
+
+    my_env = os.environ.copy()
+    my_env["MODEL_ID"] = model_id
+
+    context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)})
+
+    files = baseline
+
+    p = subprocess.Popen(["modal", "run", "./modal/predict.py","--language", language, "--files", files], env=my_env)
+    p.wait()
+    context.add_output_metadata(metadata={'modal run': MetadataValue.md("success!")})
+
+@asset(
+    partitions_def=MultiPartitionsDefinition(
+        {
+            "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']),
+            "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]),
+        }
+    ))
+def predict_bm25(context: AssetExecutionContext, bm25: str) -> None:
+    model_id = context.partition_key.keys_by_dimension["model_id"]
+    language = context.partition_key.keys_by_dimension["language"]
+
+    my_env = os.environ.copy()
+    my_env["MODEL_ID"] = model_id
+
+    context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)})
+
+    files = bm25
+
+    p = subprocess.Popen(["modal", "run", "./modal/predict.py","--language", language, "--files", files], env=my_env)
+    p.wait()
+    context.add_output_metadata(metadata={'modal run': MetadataValue.md("success!")})
 
 
 @asset(
@@ -37,7 +80,7 @@
         }
     ))
-def model_predict(context: AssetExecutionContext) -> None:
+def predict_oracle(context: AssetExecutionContext, oracle: str) -> None:
     model_id = context.partition_key.keys_by_dimension["model_id"]
     language = context.partition_key.keys_by_dimension["language"]
 
@@ -46,42 +89,59 @@ def model_predict(context: AssetExecutionContext) -> None:
 
     context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)})
 
-    files = 'line_completion.jsonl, line_completion_rg1_bm25.jsonl, line_completion_oracle_bm25.jsonl'
+    files = oracle
 
     p = subprocess.Popen(["modal", "run", "./modal/predict.py","--language", language, "--files", files], env=my_env)
    p.wait()
     context.add_output_metadata(metadata={'modal run': MetadataValue.md("success!")})
 
 
 
 @asset(
     partitions_def=MultiPartitionsDefinition(
         {
             "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']),
             "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]),
         }
-    ), deps=[model_predict])
-def matching(context) -> None:
+    ), deps=[predict_baseline])
+def matching_baseline(context) -> None:
     model_id = context.partition_key.keys_by_dimension["model_id"]
     language = context.partition_key.keys_by_dimension["language"]
 
 
     model = model_id.split("/")[-1]
-    for file in ["line_completion.jsonl", "line_completion_rg1_bm25.jsonl", "line_completion_oracle_bm25.jsonl"]:
-        analyze.analyze(model, language, file)
+    analyze.analyze(model, language, 'line_completion.jsonl')
 
-@asset
-def tabby_eval_result():
-    create_csv.create_csv()
 
 
+@asset(
+    partitions_def=MultiPartitionsDefinition(
+        {
+            "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']),
+            "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]),
+        }
+    ), deps=[predict_bm25])
+def matching_bm25(context) -> None:
+    model_id = context.partition_key.keys_by_dimension["model_id"]
+    language = context.partition_key.keys_by_dimension["language"]
+
+
+    model = model_id.split("/")[-1]
+    analyze.analyze(model, language, 'line_completion_rg1_bm25.jsonl')
+
 
-@asset(deps=[tabby_eval_result])
-def tabby_dataset():
-    return pd.read_csv(file_relative_path(__file__,'tabby.csv'))
 
-tabby_jupyter_notebook = define_dagstermill_asset(
-    name = 'tabby_jupyter',
-    notebook_path = file_relative_path(__file__, "tabby_eval.ipynb"),
-    ins={"df": AssetIn("tabby_dataset")},
-)
+@asset(
+    partitions_def=MultiPartitionsDefinition(
+        {
+            "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']),
+            "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]),
+        }
+    ), deps=[predict_oracle])
+def matching_oracle(context) -> None:
+    model_id = context.partition_key.keys_by_dimension["model_id"]
+    language = context.partition_key.keys_by_dimension["language"]
+
+
+    model = model_id.split("/")[-1]
+    analyze.analyze(model, language, 'line_completion_oracle_bm25.jsonl')
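
Note: the same MultiPartitionsDefinition literal is repeated in all six partitioned assets above. A possible follow-up (hypothetical, not part of this commit) is to hoist it into one shared module-level constant; the name TABBY_PARTITIONS is invented here:

# Sketch of deduplicating the partition scheme repeated in each asset above.
from dagster import MultiPartitionsDefinition, StaticPartitionsDefinition

TABBY_PARTITIONS = MultiPartitionsDefinition(
    {
        "model_id": StaticPartitionsDefinition([
            "TabbyML/StarCoder-1B", "TabbyML/StarCoder-3B", "TabbyML/StarCoder-7B",
            "TabbyML/WizardCoder-1B", "TabbyML/WizardCoder-3B",
            "TabbyML/CodeLlama-7B", "TabbyML/CodeLlama-13B",
        ]),
        "language": StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]),
    }
)

# each asset would then declare @asset(partitions_def=TABBY_PARTITIONS, ...)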
23 changes: 22 additions & 1 deletion python/tabby-eval/tabby_data_pipeline/create_csv.py
@@ -2,6 +2,15 @@
 import json
 import pandas as pd
 
+from dagster import (
+    asset,
+    AssetIn,
+    file_relative_path
+)
+from dagstermill import define_dagstermill_asset
+
+
+
 models = ["StarCoder-1B", "StarCoder-3B", "StarCoder-7B", "CodeLlama-7B", "CodeLlama-13B", "WizardCoder-1B", "WizardCoder-3B", "DeepseekCoder-1.3B", "DeepseekCoder-6.7B"]
 languages = {"csharp": "C#", "java": "Java", "python": "Python", "typescript": "Typescript"}
 files = ["line_completion.jsonl", 'line_completion_rg1_bm25.jsonl', 'line_completion_oracle_bm25.jsonl']
@@ -20,6 +29,7 @@ def get_match(model, language, file):
 
     return count
 
+@asset
 def create_csv():
     for model in models:
         for language in languages.keys():
@@ -32,4 +42,15 @@ def create_csv():
     df = pd.DataFrame(stat, columns=headers)
     print(df)
 
-    df.to_csv('./tabby_data_pipeline/tabby.csv', index=False)
+    df.to_csv('./tabby_data_pipeline/tabby.csv', index=False)
+
+
+@asset(deps=[create_csv])
+def tabby_dataset():
+    return pd.read_csv(file_relative_path(__file__,'tabby.csv'))
+
+tabby_jupyter_notebook = define_dagstermill_asset(
+    name = 'tabby_jupyter',
+    notebook_path = file_relative_path(__file__, "tabby_eval.ipynb"),
+    ins={"df": AssetIn("tabby_dataset")},
+)

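Note: define_dagstermill_asset executes the notebook via papermill, so the tabby_dataset input declared in ins should arrive inside the notebook as the parameter df. A hypothetical sketch of the receiving cell in tabby_eval.ipynb (the notebook itself is not in this diff, and the cell contents are assumptions):

# First cell of tabby_eval.ipynb, tagged "parameters" (hypothetical sketch);
# dagstermill overwrites `df` with the upstream tabby_dataset DataFrame.
import pandas as pd

df = pd.DataFrame()  # placeholder value, replaced at run time

# later cells can then summarize the evaluation table, e.g. df.head()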