From aee55ace3af53e180d20d4777f97cfbf18fae3c2 Mon Sep 17 00:00:00 2001 From: yan91083 Date: Tue, 28 Nov 2023 11:06:40 -0800 Subject: [PATCH] update assets --- python/tabby-eval/.gitignore | 2 + .../tabby_data_pipeline/__init__.py | 4 +- .../tabby-eval/tabby_data_pipeline/analyze.py | 15 +- .../tabby-eval/tabby_data_pipeline/assets.py | 130 +++++++++++++----- .../tabby_data_pipeline/create_csv.py | 23 +++- 5 files changed, 131 insertions(+), 43 deletions(-) create mode 100644 python/tabby-eval/.gitignore diff --git a/python/tabby-eval/.gitignore b/python/tabby-eval/.gitignore new file mode 100644 index 000000000000..e98198e48d50 --- /dev/null +++ b/python/tabby-eval/.gitignore @@ -0,0 +1,2 @@ +*tmp +tabby_data_pipeline.egg-info \ No newline at end of file diff --git a/python/tabby-eval/tabby_data_pipeline/__init__.py b/python/tabby-eval/tabby_data_pipeline/__init__.py index 503a23759246..11c46715f2df 100644 --- a/python/tabby-eval/tabby_data_pipeline/__init__.py +++ b/python/tabby-eval/tabby_data_pipeline/__init__.py @@ -4,9 +4,9 @@ from dagster import AssetIn, Field, Int, asset, file_relative_path -from . import assets +from . import assets, create_csv -all_assets = load_assets_from_modules([assets]) +all_assets = load_assets_from_modules([assets, create_csv]) defs = Definitions( assets=all_assets, diff --git a/python/tabby-eval/tabby_data_pipeline/analyze.py b/python/tabby-eval/tabby_data_pipeline/analyze.py index 3a46383c18fe..b8523d6249e7 100644 --- a/python/tabby-eval/tabby_data_pipeline/analyze.py +++ b/python/tabby-eval/tabby_data_pipeline/analyze.py @@ -1,8 +1,14 @@ +import pandas as pd import json import sys -#from eval_utils import postprocess_code_lines, remove_comments -#from tree_sitter import Language, Parser -import pandas as pd + +from dagster import ( + AssetExecutionContext, + MetadataValue, + asset, + StaticPartitionsDefinition, + MultiPartitionsDefinition, +) def get_bracket_lang_statement(completion): end_idx = None @@ -22,6 +28,7 @@ def postprocess_code_lines(prompt, target, language): except Exception as e: return target + def analyze(model, language, file): line_match = 0 @@ -78,5 +85,3 @@ def analyze(model, language, file): fout.write("\n") - - diff --git a/python/tabby-eval/tabby_data_pipeline/assets.py b/python/tabby-eval/tabby_data_pipeline/assets.py index 953ea99bbd2d..28413a27c9c3 100644 --- a/python/tabby-eval/tabby_data_pipeline/assets.py +++ b/python/tabby-eval/tabby_data_pipeline/assets.py @@ -1,32 +1,75 @@ +import modal import json import os, subprocess -import modal - -import requests import pandas as pd -import base64 -from io import BytesIO - -import matplotlib.pyplot as plt - -from typing import Dict, List - from dagster import ( AssetExecutionContext, MetadataValue, asset, - get_dagster_logger, - op, StaticPartitionsDefinition, MultiPartitionsDefinition, - AssetIn, - Field, - Int, - file_relative_path ) -from . import analyze, create_csv -from dagstermill import define_dagstermill_asset +from . 
import analyze + + +@asset +def baseline() -> str: + return "line_completion.jsonl" + +@asset +def bm25() -> str: + return "line_completion_rg1_bm25.jsonl" + +@asset +def oracle() -> str: + return "line_completion_oracle_bm25.jsonl" + +@asset( + partitions_def=MultiPartitionsDefinition( + { + "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']), + "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]), + + } + )) +def predict_baseline(context: AssetExecutionContext, baseline: str) -> None: + model_id = context.partition_key.keys_by_dimension["model_id"] + language = context.partition_key.keys_by_dimension["language"] + + my_env = os.environ.copy() + my_env["MODEL_ID"] = model_id + + context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)}) + + files = baseline + + p = subprocess.Popen(["modal", "run", "./modal/predict.py","--language", language, "--files", files], env=my_env) + p.wait() + context.add_output_metadata(metadata={'modal run': MetadataValue.md("success!")}) + +@asset( + partitions_def=MultiPartitionsDefinition( + { + "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']), + "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]), + + } + )) +def predict_bm25(context: AssetExecutionContext, bm25: str) -> None: + model_id = context.partition_key.keys_by_dimension["model_id"] + language = context.partition_key.keys_by_dimension["language"] + + my_env = os.environ.copy() + my_env["MODEL_ID"] = model_id + + context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)}) + + files = bm25 + + p = subprocess.Popen(["modal", "run", "./modal/predict.py","--language", language, "--files", files], env=my_env) + p.wait() + context.add_output_metadata(metadata={'modal run': MetadataValue.md("success!")}) @asset( @@ -37,7 +80,7 @@ } )) -def model_predict(context: AssetExecutionContext) -> None: +def predict_oracle(context: AssetExecutionContext, oracle: str) -> None: model_id = context.partition_key.keys_by_dimension["model_id"] language = context.partition_key.keys_by_dimension["language"] @@ -46,42 +89,59 @@ def model_predict(context: AssetExecutionContext) -> None: context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)}) - files = 'line_completion.jsonl, line_completion_rg1_bm25.jsonl, line_completion_oracle_bm25.jsonl' + files = oracle p = subprocess.Popen(["modal", "run", "./modal/predict.py","--language", language, "--files", files], env=my_env) p.wait() context.add_output_metadata(metadata={'modal run': MetadataValue.md("success!")}) + @asset( partitions_def=MultiPartitionsDefinition( { "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']), "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]), } - ), deps=[model_predict]) -def matching(context) -> None: + ), deps=[predict_baseline]) +def matching_baseline(context) -> None: model_id = context.partition_key.keys_by_dimension["model_id"] language = context.partition_key.keys_by_dimension["language"] model = 
model_id.split("/")[-1] - for file in ["line_completion.jsonl", "line_completion_rg1_bm25.jsonl", "line_completion_oracle_bm25.jsonl"]: - analyze.analyze(model, language, file) + analyze.analyze(model, language, 'line_completion.jsonl') -@asset -def tabby_eval_result(): - create_csv.create_csv() +@asset( + partitions_def=MultiPartitionsDefinition( + { + "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']), + "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]), + } + ), deps=[predict_bm25]) +def matching_bm25(context) -> None: + model_id = context.partition_key.keys_by_dimension["model_id"] + language = context.partition_key.keys_by_dimension["language"] + + + model = model_id.split("/")[-1] + analyze.analyze(model, language, 'line_completion_rg1_bm25.jsonl') + -@asset(deps=[tabby_eval_result]) -def tabby_dataset(): - return pd.read_csv(file_relative_path(__file__,'tabby.csv')) -tabby_jupyter_notebook = define_dagstermill_asset( - name = 'tabby_jupyter', - notebook_path = file_relative_path(__file__, "tabby_eval.ipynb"), - ins={"df": AssetIn("tabby_dataset")}, -) +@asset( + partitions_def=MultiPartitionsDefinition( + { + "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']), + "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]), + } + ), deps=[predict_oracle]) +def matching_oracle(context) -> None: + model_id = context.partition_key.keys_by_dimension["model_id"] + language = context.partition_key.keys_by_dimension["language"] + + model = model_id.split("/")[-1] + analyze.analyze(model, language, 'line_completion_oracle_bm25.jsonl') \ No newline at end of file diff --git a/python/tabby-eval/tabby_data_pipeline/create_csv.py b/python/tabby-eval/tabby_data_pipeline/create_csv.py index 72080f1323cb..70b04ae5ce4e 100644 --- a/python/tabby-eval/tabby_data_pipeline/create_csv.py +++ b/python/tabby-eval/tabby_data_pipeline/create_csv.py @@ -2,6 +2,15 @@ import json import pandas as pd +from dagster import ( + asset, + AssetIn, + file_relative_path + ) +from dagstermill import define_dagstermill_asset + + + models = ["StarCoder-1B", "StarCoder-3B", "StarCoder-7B", "CodeLlama-7B", "CodeLlama-13B", "WizardCoder-1B", "WizardCoder-3B", "DeepseekCoder-1.3B", "DeepseekCoder-6.7B"] languages = {"csharp": "C#", "java": "Java", "python": "Python", "typescript": "Typescript"} files = ["line_completion.jsonl", 'line_completion_rg1_bm25.jsonl', 'line_completion_oracle_bm25.jsonl'] @@ -20,6 +29,7 @@ def get_match(model, language, file): return count +@asset def create_csv(): for model in models: for language in languages.keys(): @@ -32,4 +42,15 @@ def create_csv(): df = pd.DataFrame(stat, columns=headers) print(df) - df.to_csv('./tabby_data_pipeline/tabby.csv', index=False) \ No newline at end of file + df.to_csv('./tabby_data_pipeline/tabby.csv', index=False) + + +@asset(deps=[create_csv]) +def tabby_dataset(): + return pd.read_csv(file_relative_path(__file__,'tabby.csv')) + +tabby_jupyter_notebook = define_dagstermill_asset( + name = 'tabby_jupyter', + notebook_path = file_relative_path(__file__, "tabby_eval.ipynb"), + ins={"df": AssetIn("tabby_dataset")}, +)